📄 integer_8cpp-source.html
字号:
00536 MulAcc(0, 1);00537 MulAcc(1, 0);00538 00539 SaveMulAcc(1, 2, 0);00540 MulAcc(1, 1);00541 MulAcc(0, 2);00542 00543 R[2] = c;00544 R[3] = d + A[0] * B[3] + A[1] * B[2] + A[2] * B[1] + A[3] * B[0];00545 }00546 00547 <span class="keywordtype">void</span> Portable::Multiply8Bottom(word *R, <span class="keyword">const</span> word *A, <span class="keyword">const</span> word *B)00548 {00549 dword p;00550 word c, d, e;00551 00552 p = (dword)A[0] * B[0];00553 R[0] = LOW_WORD(p);00554 c = HIGH_WORD(p);00555 d = e = 0;00556 00557 MulAcc(0, 1);00558 MulAcc(1, 0);00559 00560 SaveMulAcc(1, 2, 0);00561 MulAcc(1, 1);00562 MulAcc(0, 2);00563 00564 SaveMulAcc(2, 0, 3);00565 MulAcc(1, 2);00566 MulAcc(2, 1);00567 MulAcc(3, 0);00568 00569 SaveMulAcc(3, 0, 4);00570 MulAcc(1, 3);00571 MulAcc(2, 2);00572 MulAcc(3, 1);00573 MulAcc(4, 0);00574 00575 SaveMulAcc(4, 0, 5);00576 MulAcc(1, 4);00577 MulAcc(2, 3);00578 MulAcc(3, 2);00579 MulAcc(4, 1);00580 MulAcc(5, 0);00581 00582 SaveMulAcc(5, 0, 6);00583 MulAcc(1, 5);00584 MulAcc(2, 4);00585 MulAcc(3, 3);00586 MulAcc(4, 2);00587 MulAcc(5, 1);00588 MulAcc(6, 0);00589 00590 R[6] = c;00591 R[7] = d + A[0] * B[7] + A[1] * B[6] + A[2] * B[5] + A[3] * B[4] +00592 A[4] * B[3] + A[5] * B[2] + A[6] * B[1] + A[7] * B[0];00593 }00594 00595 <span class="preprocessor">#undef MulAcc</span>00596 <span class="preprocessor"></span><span class="preprocessor">#undef SaveMulAcc</span>00597 <span class="preprocessor"></span><span class="preprocessor">#undef SquAcc</span>00598 <span class="preprocessor"></span><span class="preprocessor">#undef SaveSquAcc</span>00599 <span class="preprocessor"></span>00600 <span class="comment">// CodeWarrior defines _MSC_VER</span>00601 <span class="preprocessor">#if defined(_MSC_VER) && !defined(__MWERKS__) && defined(_M_IX86) && (_M_IX86<=700)</span>00602 <span class="preprocessor"></span>00603 <span class="keyword">class </span>PentiumOptimized : <span class="keyword">public</span> Portable00604 {00605 <span class="keyword">public</span>:00606 <span class="keyword">static</span> word __fastcall Add(word *C, <span class="keyword">const</span> word *A, <span class="keyword">const</span> word *B, <span class="keywordtype">unsigned</span> <span class="keywordtype">int</span> N);00607 <span class="keyword">static</span> word __fastcall Subtract(word *C, <span class="keyword">const</span> word *A, <span class="keyword">const</span> word *B, <span class="keywordtype">unsigned</span> <span class="keywordtype">int</span> N);00608 <span class="keyword">static</span> <span class="keyword">inline</span> <span class="keywordtype">void</span> Square4(word *R, <span class="keyword">const</span> word *A)00609 {00610 <span class="comment">// VC60 workaround: MSVC 6.0 has an optimization bug that makes</span>00611 <span class="comment">// (dword)A*B where either A or B has been cast to a dword before</span>00612 <span class="comment">// very expensive. Revisit this function when this</span>00613 <span class="comment">// bug is fixed.</span>00614 Multiply4(R, A, A);00615 }00616 };00617 00618 <span class="keyword">typedef</span> PentiumOptimized LowLevel;00619 00620 __declspec(naked) word __fastcall PentiumOptimized::Add(word *C, <span class="keyword">const</span> word *A, <span class="keyword">const</span> word *B, <span class="keywordtype">unsigned</span> <span class="keywordtype">int</span> N)00621 {00622 __asm00623 {00624 push ebp00625 push ebx00626 push esi00627 push edi00628 00629 mov esi, [esp+24] ; N00630 mov ebx, [esp+20] ; B00631 00632 <span class="comment">// now: ebx = B, ecx = C, edx = A, esi = N</span>00633 00634 sub ecx, edx <span class="comment">// hold the distance between C & A so we can add this to A to get C</span>00635 xor eax, eax <span class="comment">// clear eax</span>00636 00637 sub eax, esi <span class="comment">// eax is a negative index from end of B</span>00638 lea ebx, [ebx+4*esi] <span class="comment">// ebx is end of B</span>00639 00640 sar eax, 1 <span class="comment">// unit of eax is now dwords; this also clears the carry flag</span>00641 jz loopend <span class="comment">// if no dwords then nothing to do</span>00642 00643 loopstart:00644 mov esi,[edx] <span class="comment">// load lower word of A</span>00645 mov ebp,[edx+4] <span class="comment">// load higher word of A</span>00646 00647 mov edi,[ebx+8*eax] <span class="comment">// load lower word of B</span>00648 lea edx,[edx+8] <span class="comment">// advance A and C</span>00649 00650 adc esi,edi <span class="comment">// add lower words</span>00651 mov edi,[ebx+8*eax+4] <span class="comment">// load higher word of B</span>00652 00653 adc ebp,edi <span class="comment">// add higher words</span>00654 inc eax <span class="comment">// advance B</span>00655 00656 mov [edx+ecx-8],esi <span class="comment">// store lower word result</span>00657 mov [edx+ecx-4],ebp <span class="comment">// store higher word result</span>00658 00659 jnz loopstart <span class="comment">// loop until eax overflows and becomes zero</span>00660 00661 loopend:00662 adc eax, 0 <span class="comment">// store carry into eax (return result register)</span>00663 pop edi00664 pop esi00665 pop ebx00666 pop ebp00667 ret 800668 }00669 }00670 00671 __declspec(naked) word __fastcall PentiumOptimized::Subtract(word *C, <span class="keyword">const</span> word *A, <span class="keyword">const</span> word *B, <span class="keywordtype">unsigned</span> <span class="keywordtype">int</span> N)00672 {00673 __asm00674 {00675 push ebp00676 push ebx00677 push esi00678 push edi00679 00680 mov esi, [esp+24] ; N00681 mov ebx, [esp+20] ; B00682 00683 sub ecx, edx00684 xor eax, eax00685 00686 sub eax, esi00687 lea ebx, [ebx+4*esi]00688 00689 sar eax, 100690 jz loopend00691 00692 loopstart:00693 mov esi,[edx]00694 mov ebp,[edx+4]00695 00696 mov edi,[ebx+8*eax]00697 lea edx,[edx+8]00698 00699 sbb esi,edi00700 mov edi,[ebx+8*eax+4]00701 00702 sbb ebp,edi00703 inc eax00704 00705 mov [edx+ecx-8],esi00706 mov [edx+ecx-4],ebp00707 00708 jnz loopstart00709 00710 loopend:00711 adc eax, 000712 pop edi00713 pop esi00714 pop ebx00715 pop ebp00716 ret 800717 }00718 }00719 00720 <span class="preprocessor">#ifdef SSE2_INTRINSICS_AVAILABLE</span>00721 <span class="preprocessor"></span>00722 <span class="keyword">static</span> <span class="keywordtype">bool</span> GetSSE2Capability()00723 {00724 word32 b;00725 00726 __asm00727 {00728 mov eax, 100729 cpuid00730 mov b, edx00731 }00732 00733 <span class="keywordflow">return</span> (b & (1 << 26)) != 0;00734 }00735 00736 <span class="keywordtype">bool</span> g_sse2DetectionDone = <span class="keyword">false</span>, g_sse2Detected, g_sse2Enabled = <span class="keyword">true</span>;00737 00738 <span class="keywordtype">void</span> DisableSSE2()00739 {00740 g_sse2Enabled = <span class="keyword">false</span>;00741 }00742 00743 <span class="keyword">static</span> <span class="keyword">inline</span> <span class="keywordtype">bool</span> HasSSE2()00744 {00745 <span class="keywordflow">if</span> (g_sse2Enabled && !g_sse2DetectionDone)00746 {00747 g_sse2Detected = GetSSE2Capability();00748 g_sse2DetectionDone = <span class="keyword">true</span>;00749 }00750 <span class="keywordflow">return</span> g_sse2Enabled && g_sse2Detected;00751 }00752 00753 <span class="keyword">class </span>P4Optimized : <span class="keyword">public</span> PentiumOptimized00754 {00755 <span class="keyword">public</span>:00756 <span class="keyword">static</span> word __fastcall Add(word *C, <span class="keyword">const</span> word *A, <span class="keyword">const</span> word *B, <span class="keywordtype">unsigned</span> <span class="keywordtype">int</span> N);00757 <span class="keyword">static</span> word __fastcall Subtract(word *C, <span class="keyword">const</span> word *A, <span class="keyword">const</span> word *B, <span class="keywordtype">unsigned</span> <span class="keywordtype">int</span> N);00758 <span class="keyword">static</span> <span class="keywordtype">void</span> Multiply4(word *C, <span class="keyword">const</span> word *A, <span class="keyword">const</span> word *B);00759 <span class="keyword">static</span> <span class="keywordtype">void</span> Multiply8(word *C, <span class="keyword">const</span> word *A, <span class="keyword">const</span> word *B);00760 <span class="keyword">static</span> <span class="keyword">inline</span> <span class="keywordtype">void</span> Square4(word *R, <span class="keyword">const</span> word *A)00761 {00762 Multiply4(R, A, A);00763 }00764 <span class="keyword">static</span> <span class="keywordtype">void</span> Multiply8Bottom(word *C, <span class="keyword">const</span> word *A, <span class="keyword">const</span> word *B);00765 };00766 00767 <span class="keyword">static</span> <span class="keywordtype">void</span> __fastcall P4_Mul(__m128i *C, <span class="keyword">const</span> __m128i *A, <span class="keyword">const</span> __m128i *B)00768 {00769 __m128i a3210 = _mm_load_si128(A);00770 __m128i b3210 = _mm_load_si128(B);00771 00772 __m128i sum;00773 00774 __m128i z = _mm_setzero_si128();00775 __m128i a2b2_a0b0 = _mm_mul_epu32(a3210, b3210);00776 C[0] = a2b2_a0b0;00777 00778 __m128i a3120 = _mm_shuffle_epi32(a3210, _MM_SHUFFLE(3, 1, 2, 0));00779 __m128i b3021 = _mm_shuffle_epi32(b3210, _MM_SHUFFLE(3, 0, 2, 1));00780 __m128i a1b0_a0b1 = _mm_mul_epu32(a3120, b3021);00781 __m128i a1b0 = _mm_unpackhi_epi32(a1b0_a0b1, z);00782 __m128i a0b1 = _mm_unpacklo_epi32(a1b0_a0b1, z);00783 C[1] = _mm_add_epi64(a1b0, a0b1);00784 00785 __m128i a31 = _mm_srli_epi64(a3210, 32);00786 __m128i b31 = _mm_srli_epi64(b3210, 32);00787 __m128i a3b3_a1b1 = _mm_mul_epu32(a31, b31);00788 C[6] = a3b3_a1b1;00789 00790 __m128i a1b1 = _mm_unpacklo_epi32(a3b3_a1b1, z);00791 __m128i b3012 = _mm_shuffle_epi32(b3210, _MM_SHUFFLE(3, 0, 1, 2));00792 __m128i a2b0_a0b2 = _mm_mul_epu32(a3210, b3012);00793 __m128i a0b2 = _mm_unpacklo_epi32(a2b0_a0b2, z);00794 __m128i a2b0 = _mm_unpackhi_epi32(a2b0_a0b2, z);00795 sum = _mm_add_epi64(a1b1, a0b2);00796 C[2] = _mm_add_epi64(sum, a2b0);00797 00798 __m128i a2301 = _mm_shuffle_epi32(a3210, _MM_SHUFFLE(2, 3, 0, 1));00799 __m128i b2103 = _mm_shuffle_epi32(b3210, _MM_SHUFFLE(2, 1, 0, 3));00800 __m128i a3b0_a1b2 = _mm_mul_epu32(a2301, b3012);00801 __m128i a2b1_a0b3 = _mm_mul_epu32(a3210, b2103);00802 __m128i a3b0 = _mm_unpackhi_epi32(a3b0_a1b2, z);00803 __m128i a1b2 = _mm_unpacklo_epi32(a3b0_a1b2, z);00804 __m128i a2b1 = _mm_unpackhi_epi32(a2b1_a0b3, z);00805 __m128i a0b3 = _mm_unpacklo_epi32(a2b1_a0b3, z);00806 __m128i sum1 = _mm_add_epi64(a3b0, a1b2);
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -