SSE3和SSSE3 Intrinsics各函数介绍
                                                            生活随笔
收集整理的這篇文章主要介紹了
                                SSE3和SSSE3 Intrinsics各函数介绍
小編覺得挺不錯的,現在分享給大家,幫大家做個參考。
SIMD相關頭文件包括:
//#include <ivec.h>//MMX
//#include <fvec.h>//SSE(also include ivec.h)
//#include <dvec.h>//SSE2(also include fvec.h)
#include <mmintrin.h> //MMX
#include <xmmintrin.h> //SSE(include mmintrin.h)
#include <emmintrin.h> //SSE2(include xmmintrin.h)
#include <pmmintrin.h> //SSE3(include emmintrin.h)
#include <tmmintrin.h>//SSSE3(include pmmintrin.h)
#include <smmintrin.h>//SSE4.1(include tmmintrin.h)
#include <nmmintrin.h>//SSE4.2(include smmintrin.h)
#include <wmmintrin.h>//AES(include nmmintrin.h)
#include <immintrin.h>//AVX(include wmmintrin.h)
#include <intrin.h>//(include immintrin.h)

mmintrin.h為MMX 頭文件,其中__m64的定義為:
/*
 * __m64: the 64-bit MMX vector type. It is a union, so every member below
 * is a different lane layout over the same 64 bits.
 * NOTE(review): _CRT_ALIGN and __declspec(intrin_type) are defined outside
 * this excerpt; _CRT_ALIGN(8) presumably requests 8-byte alignment.
 */
typedef union __declspec(intrin_type) _CRT_ALIGN(8) __m64
{
    unsigned __int64    m64_u64;        /* 1 x unsigned 64-bit */
    float               m64_f32[2];     /* 2 x float */
    __int8              m64_i8[8];      /* 8 x signed 8-bit */
    __int16             m64_i16[4];     /* 4 x signed 16-bit */
    __int32             m64_i32[2];     /* 2 x signed 32-bit */
    __int64             m64_i64;        /* 1 x signed 64-bit */
    unsigned __int8     m64_u8[8];      /* 8 x unsigned 8-bit */
    unsigned __int16    m64_u16[4];     /* 4 x unsigned 16-bit */
    unsigned __int32    m64_u32[2];     /* 2 x unsigned 32-bit */
} __m64;

xmmintrin.h為SSE 頭文件,此頭文件里包含MMX頭文件,其中__m128的定義為:
/*
 * __m128: the 128-bit SSE vector type. It is a union, so every member below
 * is a different lane layout over the same 128 bits.
 * NOTE(review): _CRT_ALIGN and __declspec(intrin_type) are defined outside
 * this excerpt; _CRT_ALIGN(16) presumably requests 16-byte alignment.
 */
typedef union __declspec(intrin_type) _CRT_ALIGN(16) __m128
{
    float               m128_f32[4];    /* 4 x float */
    unsigned __int64    m128_u64[2];    /* 2 x unsigned 64-bit */
    __int8              m128_i8[16];    /* 16 x signed 8-bit */
    __int16             m128_i16[8];    /* 8 x signed 16-bit */
    __int32             m128_i32[4];    /* 4 x signed 32-bit */
    __int64             m128_i64[2];    /* 2 x signed 64-bit */
    unsigned __int8     m128_u8[16];    /* 16 x unsigned 8-bit */
    unsigned __int16    m128_u16[8];    /* 8 x unsigned 16-bit */
    unsigned __int32    m128_u32[4];    /* 4 x unsigned 32-bit */
} __m128;

emmintrin.h為SSE2頭文件,此頭文件里包含SSE頭文件,其中__m128i和__m128d的定義為:
/*
 * __m128i: the 128-bit SSE2 integer vector type. It is a union, so every
 * member below is a different integer lane layout over the same 128 bits.
 * NOTE(review): _CRT_ALIGN and __declspec(intrin_type) are defined outside
 * this excerpt; _CRT_ALIGN(16) presumably requests 16-byte alignment.
 */
typedef union __declspec(intrin_type) _CRT_ALIGN(16) __m128i
{
    __int8              m128i_i8[16];   /* 16 x signed 8-bit */
    __int16             m128i_i16[8];   /* 8 x signed 16-bit */
    __int32             m128i_i32[4];   /* 4 x signed 32-bit */
    __int64             m128i_i64[2];   /* 2 x signed 64-bit */
    unsigned __int8     m128i_u8[16];   /* 16 x unsigned 8-bit */
    unsigned __int16    m128i_u16[8];   /* 8 x unsigned 16-bit */
    unsigned __int32    m128i_u32[4];   /* 4 x unsigned 32-bit */
    unsigned __int64    m128i_u64[2];   /* 2 x unsigned 64-bit */
} __m128i;

/*
 * __m128d: the 128-bit SSE2 double-precision vector type. Unlike the types
 * above it is a struct with a single member, not a union.
 */
typedef struct __declspec(intrin_type) _CRT_ALIGN(16) __m128d
{
    double              m128d_f64[2];   /* 2 x double */
} __m128d;

pmmintrin.h為SSE3頭文件,其文件中各函數的介紹:
	/*New Single precision vector instructions*///a=(a0, a1, a2, a3), b=(b0, b1, b2, b3)//則r0=a0-b0, r1=a1+b1, r2=a2-b2, r3=a3+b3extern __m128 _mm_addsub_ps(__m128 a, __m128 b);//a=(a0, a1, a2, a3), b=(b0, b1, b2, b3)//則r0=a0+a1, r1=a2+a3, r2=b0+b1, r3=b2+b3extern __m128 _mm_hadd_ps(__m128 a, __m128 b);//a=(a0, a1, a2, a3), b=(b0, b1, b2, b3)//則r0=a0-a1, r1=a2-a3, r2=b0-b1, r3=b2-b3extern __m128 _mm_hsub_ps(__m128 a, __m128 b);//a=(a0, a1, a2, a3), 則r0=a1, r1=a1, r2=a3, r3=a3extern __m128 _mm_movehdup_ps(__m128 a);//a=(a0, a1, a2, a3), 則r0=a0, r1=a0, r2=a2, r3=a2extern __m128 _mm_moveldup_ps(__m128 a);/*New double precision vector instructions*///a=(a0, a1), b=(b0, b1), 則r0=a0-b0, r1=a1+b1extern __m128d _mm_addsub_pd(__m128d a, __m128d b);//a=(a0, a1), b=(b0, b1), 則r0=a0+a1, r1=b0+b1extern __m128d _mm_hadd_pd(__m128d a, __m128d b);//a=(a0, a1), b=(b0, b1), 則r0=a0-a1, r1=b0-b1extern __m128d _mm_hsub_pd(__m128d a, __m128d b);//r0=r1=dp[0]extern __m128d _mm_loaddup_pd(double const * dp);//a=(a0, a1),則r0=r1=a0extern __m128d _mm_movedup_pd(__m128d a);/*New unaligned integer vector load instruction*///load unaligned data using _mm_lddqu_si128 for best performance//If the address is not 16-byte aligned, the load begins at the //highest 16-byte-aligned address less than the address of Dataextern __m128i _mm_lddqu_si128(__m128i const *p);/*Miscellaneous new instructions,For _mm_monitor p goes in eax, extensions goes in ecx, hints goes in edx*///The monitor instruction sets up an address range for hardware monitoring.//The values of extensions and hints correspond to the values in ECX and EDX//used by the monitor instruction. They are reserved for future use and should//be zero for the SSE3-enabled processor. 
For more information, //see the Intel or AMD documentation as appropriate.extern void _mm_monitor(void const *p, unsigned extensions, unsigned hints);/*Miscellaneous new instructions,For _mm_mwait, extensions goes in ecx, hints goes in eax*///The mwait instruction instructs the processor to enter a wait state in which the//processor is instructed to monitor the address range between extensions and hints//and wait for an event or a store to that address range. The values of extensions //and hints are loaded into the ECX and EAX registers. For more information,//see the Intel or AMD documentation as appropriate.extern void _mm_mwait(unsigned extensions, unsigned hints);tmmintrin.h為SSSE3頭文件, 其 文件中各函數(shù)的介紹:
	/*Add horizonally packed [saturated] words, double words,{X,}MM2/m{128,64} (b) to {X,}MM1 (a).*///a=(a0, a1, a2, a3, a4, a5, a6, a7), b=(b0, b1, b2, b3, b4, b5, b6, b7)//則r0=a0+a1, r1=a2+a3, r2=a4+a5, r3=a6+a7, r4=b0+b1, r5=b2+b3, r6=b4+b5, r7=b6+b7 extern __m128i _mm_hadd_epi16 (__m128i a, __m128i b);//a=(a0, a1, a2, a3), b=(b0, b1, b2, b3)//則r0=a0+a1, r1=a2+a3, r2=b0+b1, r3=b2+b3extern __m128i _mm_hadd_epi32 (__m128i a, __m128i b);//SATURATE_16(x) is ((x > 32767) ? 32767 : ((x < -32768) ? -32768 : x))//a=(a0, a1, a2, a3, a4, a5, a6, a7), b=(b0, b1, b2, b3, b4, b5, b6, b7)//則r0=SATURATE_16(a0+a1), ..., r3=SATURATE_16(a6+a7), //r4=SATURATE_16(b0+b1), ..., r7=SATURATE_16(b6+b7)extern __m128i _mm_hadds_epi16 (__m128i a, __m128i b);//a=(a0, a1, a2, a3), b=(b0, b1, b2, b3)//則r0=a0+a1, r1=a2+a3, r2=b0+b1, r3=b2+b3extern __m64 _mm_hadd_pi16 (__m64 a, __m64 b);//a=(a0, a1), b=(b0, b1), 則r0=a0+a1, r1=b0+b1extern __m64 _mm_hadd_pi32 (__m64 a, __m64 b);//SATURATE_16(x) is ((x > 32767) ? 32767 : ((x < -32768) ? -32768 : x))//a=(a0, a1, a2, a3), b=(b0, b1, b2, b3)//則r0=SATURATE_16(a0+a1), r1=SATURATE_16(a2+a3), //r2=SATURATE_16(b0+b1), r3=SATURATE_16(b2+b3)extern __m64 _mm_hadds_pi16 (__m64 a, __m64 b);/*Subtract horizonally packed [saturated] words, double words,{X,}MM2/m{128,64} (b) from {X,}MM1 (a).*///a=(a0, a1, a2, a3, a4, a5, a6, a7), b=(b0, b1, b2, b3, b4, b5, b6, b7)//則r0=a0-a1, r1=a2-a3, r2=a4-a5, r3=a6-a7, r4=b0-b1, r5=b2-b3, r6=b4-b5, r7=b6-b7extern __m128i _mm_hsub_epi16 (__m128i a, __m128i b);//a=(a0, a1, a2, a3), b=(b0, b1, b2, b3)//則r0=a0-a1, r1=a2-a3, r2=b0-b1, r3=b2-b3extern __m128i _mm_hsub_epi32 (__m128i a, __m128i b);//SATURATE_16(x) is ((x > 32767) ? 32767 : ((x < -32768) ? 
-32768 : x))//a=(a0, a1, a2, a3, a4, a5, a6, a7), b=(b0, b1, b2, b3, b4, b5, b6, b7)//則r0=SATURATE_16(a0-a1), ..., r3=SATURATE_16(a6-a7), //r4=SATURATE_16(b0-b1), ..., r7=SATURATE_16(b6-b7)extern __m128i _mm_hsubs_epi16 (__m128i a, __m128i b);//a=(a0, a1, a2, a3), b=(b0, b1, b2, b3)//則r0=a0-a1, r1=a2-a3, r2=b0-b1, r3=b2-b3extern __m64 _mm_hsub_pi16 (__m64 a, __m64 b);//a=(a0, a1), b=(b0, b1), 則r0=a0-a1, r1=b0-b1extern __m64 _mm_hsub_pi32 (__m64 a, __m64 b);//SATURATE_16(x) is ((x > 32767) ? 32767 : ((x < -32768) ? -32768 : x))//a=(a0, a1, a2, a3), b=(b0, b1, b2, b3)//則r0=SATURATE_16(a0-a1), r1=SATURATE_16(a2-a3), //r2=SATURATE_16(b0-b1), r3=SATURATE_16(b2-b3)extern __m64 _mm_hsubs_pi16 (__m64 a, __m64 b);/*Multiply and add packed words,{X,}MM2/m{128,64} (b) to {X,}MM1 (a).*///SATURATE_16(x) is ((x > 32767) ? 32767 : ((x < -32768) ? -32768 : x))//a=(a0, a1, a2, ..., a13, a14, a15), b=(b0, b1, b2, ..., b13, b14, b15)//則r0=SATURATE_16((a0*b0)+(a1*b1)), ..., r7=SATURATE_16((a14*b14)+(a15*b15))//Parameter a contains unsigned bytes. Parameter b contains signed bytes.extern __m128i _mm_maddubs_epi16 (__m128i a, __m128i b);//SATURATE_16(x) is ((x > 32767) ? 32767 : ((x < -32768) ? -32768 : x))//a=(a0, a1, a2, a3, a4, a5, a6, a7), b=(b0, b1, b2, b3, b4, b5, b6, b7)//則r0=SATURATE_16((a0*b0)+(a1*b1)), ..., r3=SATURATE_16((a6*b6)+(a7*b7))//Parameter a contains unsigned bytes. 
Parameter b contains signed bytes.extern __m64 _mm_maddubs_pi16 (__m64 a, __m64 b);/*Packed multiply high integers with round and scaling,{X,}MM2/m{128,64} (b) to {X,}MM1 (a).*///a=(a0, a1, a2, a3, a4, a5, a6, a7), b=(b0, b1, b2, b3, b4, b5, b6, b7)//則r0=INT16(((a0*b0)+0x4000) >> 15), ..., r7=INT16(((a7*b7)+0x4000) >> 15)extern __m128i _mm_mulhrs_epi16 (__m128i a, __m128i b);//a=(a0, a1, a2, a3), b=(b0, b1, b2, b3)//則r0=INT16(((a0*b0)+0x4000) >> 15), ..., r3=INT16(((a3*b3)+0x4000) >> 15)extern __m64 _mm_mulhrs_pi16 (__m64 a, __m64 b);/*Packed shuffle bytes{X,}MM2/m{128,64} (b) by {X,}MM1 (a).*///SELECT(a, n) extracts the nth 8-bit parameter from a. The 0th 8-bit parameter//is the least significant 8-bits, b=(b0, b1, b2, ..., b13, b14, b15), b is mask//則r0 = (b0 & 0x80) ? 0 : SELECT(a, b0 & 0x0f), ...,//r15 = (b15 & 0x80) ? 0 : SELECT(a, b15 & 0x0f)extern __m128i _mm_shuffle_epi8 (__m128i a, __m128i b);//SELECT(a, n) extracts the nth 8-bit parameter from a. The 0th 8-bit parameter//is the least significant 8-bits, b=(b0, b1, ..., b7), b is mask//則r0= (b0 & 0x80) ? 0 : SELECT(a, b0 & 0x07),...,//r7=(b7 & 0x80) ? 0 : SELECT(a, b7 & 0x07)extern __m64 _mm_shuffle_pi8 (__m64 a, __m64 b);/*Packed byte, word, double word sign, {X,}MM2/m{128,64} (b) to{X,}MM1 (a).*///a=(a0, a1, a2, ..., a13, a14, a15), b=(b0, b1, b2, ..., b13, b14, b15)//則r0=(b0 < 0) ? -a0 : ((b0 == 0) ? 0 : a0), ...,//r15= (b15 < 0) ? -a15 : ((b15 == 0) ? 0 : a15)extern __m128i _mm_sign_epi8 (__m128i a, __m128i b);//a=(a0, a1, a2, a3, a4, a5, a6, a7), b=(b0, b1, b2, b3, b4, b5, b6, b7)//r0=(b0 < 0) ? -a0 : ((b0 == 0) ? 0 : a0), ...,//r7= (b7 < 0) ? -a7 : ((b7 == 0) ? 0 : a7)extern __m128i _mm_sign_epi16 (__m128i a, __m128i b);//a=(a0, a1, a2, a3), b=(b0, b1, b2, b3)//則r0=(b0 < 0) ? -a0 : ((b0 == 0) ? 0 : a0), ...,//r3= (b3 < 0) ? -a3 : ((b3 == 0) ? 0 : a3)extern __m128i _mm_sign_epi32 (__m128i a, __m128i b);//a=(a0, a1, a2, a3, a4, a5, a6, a7), b=(b0, b1, b2, b3, b4, b5, b6, b7)//則r0=(b0 < 0) ? 
-a0 : ((b0 == 0) ? 0 : a0), ...,//r7= (b7 < 0) ? -a7 : ((b7 == 0) ? 0 : a7)extern __m64 _mm_sign_pi8 (__m64 a, __m64 b);//a=(a0, a1, a2, a3), b=(b0, b1, b2, b3)//則r0=(b0 < 0) ? -a0 : ((b0 == 0) ? 0 : a0), ...,//r3= (b3 < 0) ? -a3 : ((b3 == 0) ? 0 : a3)extern __m64 _mm_sign_pi16 (__m64 a, __m64 b);//a=(a0, a1), b=(b0, b1), 則r0=(b0 < 0) ? -a0 : ((b0 == 0) ? 0 : a0),//r1= (b1 < 0) ? -a1 : ((b1 == 0) ? 0 : a1)extern __m64 _mm_sign_pi32 (__m64 a, __m64 b);/*Packed align and shift right by n*8 bits,{X,}MM2/m{128,64} (b) to {X,}MM1 (a).*///n: A constant that specifies how many bytes the interim result will be //shifted to the right, If n > 32, the result value is zero //CONCAT(a, b) is the 256-bit unsigned intermediate value that is a concatenation of //parameters a and b. The result is this intermediate value shifted right by n bytes.//則r= (CONCAT(a, b) >> (n * 8)) & 0xffffffffffffffffextern __m128i _mm_alignr_epi8 (__m128i a, __m128i b, int n);//n: An integer constant that specifies how many bytes to shift the interim //result to the right,If n > 16, the result value is zero//CONCAT(a, b) is the 128-bit unsigned intermediate value that is formed by //concatenating parameters a and b. The result value is the rightmost 64 bits after//shifting this intermediate result right by n bytes//則r = (CONCAT(a, b) >> (n * 8)) & 0xffffffffextern __m64 _mm_alignr_pi8 (__m64 a, __m64 b, int n);/*Packed byte, word, double word absolute value,{X,}MM2/m{128,64} (b) to {X,}MM1 (a).*///a=(a0, a1, a2, ..., a13, a14, a15)//則r0 = (a0 < 0) ? -a0 : a0, ..., r15 = (a15 < 0) ? -a15 : a15extern __m128i _mm_abs_epi8 (__m128i a);//a=(a0, a1, a2, a3, a4, a5, a6, a7)//則r0 = (a0 < 0) ? -a0 : a0, ..., r7 = (a7 < 0) ? -a7 : a7extern __m128i _mm_abs_epi16 (__m128i a);//a=(a0, a1, a2, a3)//則r0 = (a0 < 0) ? -a0 : a0, ..., r3 = (a3 < 0) ? -a3 : a3extern __m128i _mm_abs_epi32 (__m128i a);//a=(a0, a1, a2, a3, a4, a5, a6, a7)//則r0 = (a0 < 0) ? -a0 : a0, ..., r7 = (a7 < 0) ? 
-a7 : a7extern __m64 _mm_abs_pi8 (__m64 a);//a=(a0, a1, a2, a3)//則r0 = (a0 < 0) ? -a0 : a0, ..., r3 = (a3 < 0) ? -a3 : a3extern __m64 _mm_abs_pi16 (__m64 a);//a=(a0, a1), 則r0 = (a0 < 0) ? -a0 : a0, r1 = (a1 < 0) ? -a1 : a1extern __m64 _mm_abs_pi32 (__m64 a);總結(jié)
以上是生活随笔為你收集整理的SSE3和SSSE3 Intrinsics各函数介绍的全部內容,希望文章能夠幫你解決所遇到的問題。
 
                            
                        - 上一篇: MMX Intrinsics各函数介绍
- 下一篇: 设计模式之抽象工厂模式(Abstract
