SSE指令集
SSE指令集的介紹網上已有很多, 這裡貼一個在VS2008環境下的SSE測試程序, 分別用C++代碼, C++內聯匯編, C++的SSE Intrinsics三種方式對浮點數組做逐元素轉換 (每個元素乘以系數 2.8 后開平方, 即源碼注釋中的 conversion, 並非卷積), 並比較三者的耗時。這是一個win32控制臺程序。
程序下載地址: http://download.csdn.net/detail/hemmingway/4598506
主文件的代碼一覽:
// Test_SSE.cpp : 定義控制臺應用程序的入口點。 // calc conversion //#include "stdafx.h" #include <xmmintrin.h> // __m128 data type and SSE functions #include <float.h> #include <math.h> #include <Windows.h> // Support odprintf #include <stdarg.h> #include <ctype.h> #include "MMX_SSESupport.h" #include "TimeCounter.h"#define ARRAY_SIZE 100000#pragma warning(disable : 4324)// Arrays processed by SSE should have 16 bytes alignment: __declspec(align(16)) float m_fInitialArray[ARRAY_SIZE]; __declspec(align(16)) float m_fResultArray[ARRAY_SIZE];// minimum and maximum values in the result array float m_fMin; float m_fMax;#define TIME_START CTimeCounter* pT = new CTimeCounter() #define TIME_END ShowTime(pT->GetExecutionTime())// //odprintf -- debug function void __cdecl odprintf(const char* fmt, ...) {char buf[4096], *p = buf;va_list args;va_start(args, fmt);p += vsnprintf_s(p, sizeof(buf), _TRUNCATE, fmt, args);va_end(args);while ( p > buf && isspace(p[-1]) )*--p = '\0';*p++ = '\r';*p++ = '\n';*p = '\0';OutputDebugStringA(buf); //output as ANSI string //OutputDebugString }// // Show execution time (ms) void ShowTime(__int64 nTime) {printf("usage time: %I64d\n\n",nTime); //在g++中對應的是<stdint.h> int64_t, 應該用%lld輸出 }// // ShowArray, display array's data void ShowArray(float* pArray) {if ( !(*pArray))return;float* p = pArray;for ( int i = 0; i < ARRAY_SIZE; i += 500 ) //沒有顯示所有的數據出來{printf("%f ", p[i]);if (i == 5)printf("\n");}printf("\n\n"); }// // InitArray, Fill initial array void InitArray() {m_fMin = FLT_MAX;m_fMax = FLT_MIN;float f;int i;for ( i = 0; i < ARRAY_SIZE; i++ ){// Fill array with one sin cycle and ensure that all values are positive// (to use sqrt in conversion)f = (float) sin(((double)i * 6.29 / ARRAY_SIZE)) + 2.0f;if ( f < m_fMin )m_fMin = f;if ( f > m_fMax )m_fMax = f;m_fInitialArray[i] = f;}ShowArray(m_fInitialArray); }// // Make conversion using C++ code // // Each initial array member is converted to result array member // using some formula (just to demonstrate SSE 
features). // Minimum and maximum result values are calculated and shown. // // Function also calculates and shows conversion time (ms). // void OnCplusplus() {TIME_START;m_fMin = FLT_MAX;m_fMax = FLT_MIN;int i;for ( i = 0; i < ARRAY_SIZE; i++ ){m_fResultArray[i] = sqrt(m_fInitialArray[i] * 2.8f);if ( m_fResultArray[i] < m_fMin )m_fMin = m_fResultArray[i];if ( m_fResultArray[i] > m_fMax )m_fMax = m_fResultArray[i];}TIME_END;ShowArray(m_fResultArray); }// //OnSseAssembly, Make conversion using C++ code with inline Assembly void OnSseAssembly() {TIME_START;float* pIn = m_fInitialArray;float* pOut = m_fResultArray;float f = 2.8f;float flt_min = FLT_MIN;float flt_max = FLT_MAX;__m128 min128;__m128 max128;// using additional registers:// xmm2 - multiplication coefficient// xmm3 - minimum// xmm4 - maximum_asm{movss xmm2, f // xmm2[0] = 2.8shufps xmm2, xmm2, 0 // xmm2[1, 2, 3] = xmm2[0]movss xmm3, flt_max // xmm3 = FLT_MAXshufps xmm3, xmm3, 0 // xmm3[1, 2, 3] = xmm3[0]movss xmm4, flt_min // xmm4 = FLT_MINshufps xmm4, xmm4, 0 // xmm3[1, 2, 3] = xmm3[0]mov esi, pIn // input pointermov edi, pOut // output pointermov ecx, ARRAY_SIZE/4 // loop counterstart_loop:movaps xmm1, [esi] // xmm1 = [esi]mulps xmm1, xmm2 // xmm1 = xmm1 * xmm2sqrtps xmm1, xmm1 // xmm1 = sqrt(xmm1)movaps [edi], xmm1 // [edi] = xmm1minps xmm3, xmm1maxps xmm4, xmm1add esi, 16add edi, 16dec ecxjnz start_loopmovaps min128, xmm3movaps max128, xmm4}// extract minimum and maximum values from min128 and max128union u{__m128 m;float f[4];} x;x.m = min128;m_fMin = min(x.f[0], min(x.f[1], min(x.f[2], x.f[3])));x.m = max128;m_fMax = max(x.f[0], max(x.f[1], max(x.f[2], x.f[3])));TIME_END;ShowArray(m_fResultArray); }// // OnSseCpp, Make conversion using C++ code with SSE Intrinsics void OnSseCpp() {TIME_START;__m128 coeff = _mm_set_ps1(2.8f); // coeff[0, 1, 2, 3] = 2.8__m128 tmp;__m128 min128 = _mm_set_ps1(FLT_MAX); // min128[0, 1, 2, 3] = FLT_MAX__m128 max128 = _mm_set_ps1(FLT_MIN); // max128[0, 1, 2, 3] = 
FLT_MIN__m128* pSource = (__m128*) m_fInitialArray;__m128* pDest = (__m128*) m_fResultArray;for ( int i = 0; i < ARRAY_SIZE/4; i++ ){tmp = _mm_mul_ps(*pSource, coeff); // tmp = *pSource * coeff*pDest = _mm_sqrt_ps(tmp); // *pDest = sqrt(tmp)min128 = _mm_min_ps(*pDest, min128);max128 = _mm_max_ps(*pDest, max128);pSource++;pDest++;}// extract minimum and maximum values from min128 and max128union u{__m128 m;float f[4];} x;x.m = min128;m_fMin = min(x.f[0], min(x.f[1], min(x.f[2], x.f[3])));x.m = max128;m_fMax = max(x.f[0], max(x.f[1], max(x.f[2], x.f[3])));TIME_END;ShowArray(m_fResultArray); }int _tmain(int argc, _TCHAR* argv[]) {// Test SSE support ?bool bMMX, bSSE;TestFeatures(&bMMX, &bSSE);if ( !bSSE ){// Do not support SSEodprintf("Do not support SSE.\n");return 0;}odprintf("everything is ok...");//first, prepare dataprintf("program generate %d floating point(Not all data are displayed)...\n\n", ARRAY_SIZE);InitArray();//second, Make conversion using C++ codegetchar();printf("Make conversion using C++ code\n\n");OnCplusplus();//third,Make conversion using C++ code with inline Assemblygetchar();printf("Make conversion using C++ code with inline Assembly\n\n");OnSseAssembly();//finally, Make conversion using C++ code with SSE Intrinsics getchar();printf("Make conversion using C++ code with SSE Intrinsics\n\n");OnSseCpp();getchar();return 0; }
總結
- 上一篇: 快手年龄怎么隐藏
- 下一篇: 两个C++毫秒级定时器