/search.css" rel="stylesheet" type="text/css"/> /search.js">
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
sse.h
Go to the documentation of this file.
1 /*
2 Copyright 2010-2011, D. E. Shaw Research.
3 All rights reserved.
4 
5 Redistribution and use in source and binary forms, with or without
6 modification, are permitted provided that the following conditions are
7 met:
8 
9 * Redistributions of source code must retain the above copyright
10  notice, this list of conditions, and the following disclaimer.
11 
12 * Redistributions in binary form must reproduce the above copyright
13  notice, this list of conditions, and the following disclaimer in the
14  documentation and/or other materials provided with the distribution.
15 
16 * Neither the name of D. E. Shaw Research nor the names of its
17  contributors may be used to endorse or promote products derived from
18  this software without specific prior written permission.
19 
20 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
21 "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
22 LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
23 A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
24 OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
25 SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
26 LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
27 DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
28 THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
29 (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
30 OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
31 */
32 #ifndef _Random123_sse_dot_h__
33 #define _Random123_sse_dot_h__
34 
35 #if R123_USE_SSE
36 
37 #if R123_USE_X86INTRIN_H
38 #include <x86intrin.h>
39 #endif
40 #if R123_USE_IA32INTRIN_H
41 #include <ia32intrin.h>
42 #endif
43 #if R123_USE_XMMINTRIN_H
44 #include <xmmintrin.h>
45 #endif
46 #if R123_USE_EMMINTRIN_H
47 #include <emmintrin.h>
48 #endif
49 #if R123_USE_SMMINTRIN_H
50 #include <smmintrin.h>
51 #endif
52 #if R123_USE_WMMINTRIN_H
53 #include <wmmintrin.h>
54 #endif
55 #if R123_USE_INTRIN_H
56 #include <intrin.h>
57 #endif
58 #ifdef __cplusplus
59 #include <iostream>
60 #include <limits>
61 #include <stdexcept>
62 #endif
63 
64 #if R123_USE_ASM_GNU
65 
66 /* bit25 of CX tells us whether AES is enabled. */
67 R123_STATIC_INLINE int haveAESNI(){
68  unsigned int eax, ebx, ecx, edx;
69  __asm__ __volatile__ ("cpuid": "=a" (eax), "=b" (ebx), "=c" (ecx), "=d" (edx) :
70  "a" (1));
71  return (ecx>>25) & 1;
72 }
73 #elif R123_USE_CPUID_MSVC
74 R123_STATIC_INLINE int haveAESNI(){
75  int CPUInfo[4];
76  __cpuid(CPUInfo, 1);
77  return (CPUInfo[2]>>25)&1;
78 }
79 #else /* R123_USE_CPUID_??? */
80 #warning "No R123_USE_CPUID_XXX method chosen. haveAESNI will always return false"
81 R123_STATIC_INLINE int haveAESNI(){
82  return 0;
83 }
84 #endif /* R123_USE_ASM_GNU || R123_USE_CPUID_MSVC */
85 
86 // There is a lot of annoying and inexplicable variation in the
87 // SSE intrinsics available in different compilation environments.
88 // The details seem to depend on the compiler, the version and
89 // the target architecture. Rather than insisting on
90 // R123_USE_feature tests for each of these in each of the
91 // compilerfeatures.h files we just keep the complexity localized
92 // to here...
93 #if (defined(__ICC) && __ICC<1210) || (defined(_MSC_VER) && !defined(_WIN64))
94 /* Is there an intrinsic to assemble an __m128i from two 64-bit words?
95  If not, use the 4x32-bit intrisic instead. N.B. It looks like Intel
96  added _mm_set_epi64x to icc version 12.1 in Jan 2012.
97 */
98 R123_STATIC_INLINE __m128i _mm_set_epi64x(uint64_t v1, uint64_t v0){
99  union{
100  uint64_t u64;
101  uint32_t u32[2];
102  } u1, u0;
103  u1.u64 = v1;
104  u0.u64 = v0;
105  return _mm_set_epi32(u1.u32[1], u1.u32[0], u0.u32[1], u0.u32[0]);
106 }
107 #endif
108 /* _mm_extract_lo64 abstracts the task of extracting the low 64-bit
109  word from an __m128i. The _mm_cvtsi128_si64 intrinsic does the job
110  on 64-bit platforms. Unfortunately, both MSVC and Open64 fail
111  assertions in ut_M128.cpp and ut_carray.cpp when we use the
112  _mm_cvtsi128_si64 intrinsic. (See
113  https://bugs.open64.net/show_bug.cgi?id=873 for the Open64 bug).
114  On 32-bit platforms, there's no MOVQ, so there's no intrinsic.
115  Finally, even if the intrinsic exists, it may be spelled with or
116  without the 'x'.
117 */
118 #if !defined(__x86_64__) || defined(_MSC_VER) || defined(__OPEN64__)
119 R123_STATIC_INLINE uint64_t _mm_extract_lo64(__m128i si){
120  union{
121  uint64_t u64[2];
122  __m128i m;
123  }u;
124  _mm_store_si128(&u.m, si);
125  return u.u64[0];
126 }
127 #elif defined(__llvm__) || defined(__ICC)
128 R123_STATIC_INLINE uint64_t _mm_extract_lo64(__m128i si){
129  return (uint64_t)_mm_cvtsi128_si64(si);
130 }
131 #else /* GNUC, others */
132 /* FWIW, gcc's emmintrin.h has had the 'x' spelling
133  since at least gcc-3.4.4. The no-'x' spelling showed up
134  around 4.2. */
135 R123_STATIC_INLINE uint64_t _mm_extract_lo64(__m128i si){
136  return (uint64_t)_mm_cvtsi128_si64x(si);
137 }
138 #endif
139 #if defined(__GNUC__) && __GNUC__ < 4
140 /* the cast builtins showed up in gcc4. */
141 R123_STATIC_INLINE __m128 _mm_castsi128_ps(__m128i si){
142  return (__m128)si;
143 }
144 #endif
145 
146 #ifdef __cplusplus
147 
148 struct r123m128i{
149  __m128i m;
150 #if R123_USE_CXX11_UNRESTRICTED_UNIONS
151  // C++98 forbids a union member from having *any* constructors.
152  // C++11 relaxes this, and allows union members to have constructors
153  // as long as there is a "trivial" default construtor. So in C++11
154  // we can provide a r123m128i constructor with an __m128i argument, and still
155  // have the default (and hence trivial) default constructor.
156  r123m128i() = default;
157  r123m128i(__m128i _m): m(_m){}
158 #endif
159  r123m128i& operator=(const __m128i& rhs){ m=rhs; return *this;}
160  r123m128i& operator=(R123_ULONG_LONG n){ m = _mm_set_epi64x(0, n); return *this;}
161 #if R123_USE_CXX11_EXPLICIT_CONVERSIONS
162  // With C++0x we can attach explicit to the bool conversion operator
163  // to disambiguate undesired promotions. For g++, this works
164  // only in 4.5 and above.
165  explicit operator bool() const {return _bool();}
166 #else
167  // Pre-C++0x, we have to do something else. Google for the "safe bool"
168  // idiom for other ideas...
169  operator const void*() const{return _bool()?this:0;}
170 #endif
171  operator __m128i() const {return m;}
172 
173 private:
174 #if R123_USE_SSE4_1
175  bool _bool() const{ return !_mm_testz_si128(m,m); }
176 #else
177  bool _bool() const{ return 0xf != _mm_movemask_ps(_mm_castsi128_ps(_mm_cmpeq_epi32(m, _mm_setzero_si128()))); }
178 #endif
179 };
180 
181 R123_STATIC_INLINE r123m128i& operator++(r123m128i& v){
182  __m128i& c = v.m;
183  __m128i zeroone = _mm_set_epi64x(R123_64BIT(0), R123_64BIT(1));
184  c = _mm_add_epi64(c, zeroone);
185  //return c;
186 #if R123_USE_SSE4_1
187  __m128i zerofff = _mm_set_epi64x(0, ~(R123_64BIT(0)));
188  if( R123_BUILTIN_EXPECT(_mm_testz_si128(c,zerofff), 0) ){
189  __m128i onezero = _mm_set_epi64x(R123_64BIT(1), R123_64BIT(0));
190  c = _mm_add_epi64(c, onezero);
191  }
192 #else
193  unsigned mask = _mm_movemask_ps( _mm_castsi128_ps(_mm_cmpeq_epi32(c, _mm_setzero_si128())));
194  // The low two bits of mask are 11 iff the low 64 bits of
195  // c are zero.
196  if( R123_BUILTIN_EXPECT((mask&0x3) == 0x3, 0) ){
197  __m128i onezero = _mm_set_epi64x(1,0);
198  c = _mm_add_epi64(c, onezero);
199  }
200 #endif
201  return v;
202 }
203 
204 R123_STATIC_INLINE r123m128i& operator+=(r123m128i& lhs, R123_ULONG_LONG n){
205  __m128i c = lhs.m;
206  __m128i incr128 = _mm_set_epi64x(0, n);
207  c = _mm_add_epi64(c, incr128);
208  // return c; // NO CARRY!
209 
210  int64_t lo64 = _mm_extract_lo64(c);
211  if((uint64_t)lo64 < n)
212  c = _mm_add_epi64(c, _mm_set_epi64x(1,0));
213  lhs.m = c;
214  return lhs;
215 }
216 
217 // We need this one because it's present, but never used in r123array1xm128i::incr
218 R123_STATIC_INLINE bool operator<=(R123_ULONG_LONG, const r123m128i &){
219  throw std::runtime_error("operator<=(unsigned long long, r123m128i) is unimplemented.");}
220 
221 // The comparisons aren't implemented, but if we leave them out, and
222 // somebody writes, e.g., M1 < M2, the compiler will do an implicit
223 // conversion through void*. Sigh...
224 R123_STATIC_INLINE bool operator<(const r123m128i&, const r123m128i&){
225  throw std::runtime_error("operator<(r123m128i, r123m128i) is unimplemented.");}
226 R123_STATIC_INLINE bool operator<=(const r123m128i&, const r123m128i&){
227  throw std::runtime_error("operator<=(r123m128i, r123m128i) is unimplemented.");}
228 R123_STATIC_INLINE bool operator>(const r123m128i&, const r123m128i&){
229  throw std::runtime_error("operator>(r123m128i, r123m128i) is unimplemented.");}
230 R123_STATIC_INLINE bool operator>=(const r123m128i&, const r123m128i&){
231  throw std::runtime_error("operator>=(r123m128i, r123m128i) is unimplemented.");}
232 
233 R123_STATIC_INLINE bool operator==(const r123m128i &lhs, const r123m128i &rhs){
234  return 0xf==_mm_movemask_ps(_mm_castsi128_ps(_mm_cmpeq_epi32(lhs, rhs))); }
235 R123_STATIC_INLINE bool operator!=(const r123m128i &lhs, const r123m128i &rhs){
236  return !(lhs==rhs);}
237 R123_STATIC_INLINE bool operator==(R123_ULONG_LONG lhs, const r123m128i &rhs){
238  r123m128i LHS; LHS.m=_mm_set_epi64x(0, lhs); return LHS == rhs; }
239 R123_STATIC_INLINE bool operator!=(R123_ULONG_LONG lhs, const r123m128i &rhs){
240  return !(lhs==rhs);}
241 R123_STATIC_INLINE std::ostream& operator<<(std::ostream& os, const r123m128i& m){
242  union{
243  uint64_t u64[2];
244  __m128i m;
245  }u;
246  _mm_storeu_si128(&u.m, m.m);
247  return os << u.u64[0] << " " << u.u64[1];
248 }
249 
250 R123_STATIC_INLINE std::istream& operator>>(std::istream& is, r123m128i& m){
251  uint64_t u64[2];
252  is >> u64[0] >> u64[1];
253  m.m = _mm_set_epi64x(u64[1], u64[0]);
254  return is;
255 }
256 
257 template<typename T> inline T assemble_from_u32(uint32_t *p32); // forward declaration
258 
259 template <>
261  r123m128i ret;
262  ret.m = _mm_set_epi32(p32[3], p32[2], p32[1], p32[0]);
263  return ret;
264 }
265 
266 #else
267 
268 typedef struct {
269  __m128i m;
270 } r123m128i;
271 
272 #endif /* __cplusplus */
273 
274 #else /* !R123_USE_SSE */
275 R123_STATIC_INLINE int haveAESNI(){
276  return 0;
277 }
278 #endif /* R123_USE_SSE */
279 
280 #endif /* _Random123_sse_dot_h__ */
static std::ostream & operator<<(std::ostream &os, const r123m128i &m)
Definition: sse.h:241
static bool operator==(const r123m128i &lhs, const r123m128i &rhs)
Definition: sse.h:233
static uint64_t _mm_extract_lo64(__m128i si)
Definition: sse.h:119
static bool operator!=(const r123m128i &lhs, const r123m128i &rhs)
Definition: sse.h:235
static r123m128i & operator++(r123m128i &v)
Definition: sse.h:181
static std::istream & operator>>(std::istream &is, r123m128i &m)
Definition: sse.h:250
Definition: sse.h:148
static bool operator>(const r123m128i &, const r123m128i &)
Definition: sse.h:228
static bool operator>=(const r123m128i &, const r123m128i &)
Definition: sse.h:230
r123m128i assemble_from_u32< r123m128i >(uint32_t *p32)
Definition: sse.h:260
T assemble_from_u32(uint32_t *p32)
static r123m128i & operator+=(r123m128i &lhs, R123_ULONG_LONG n)
Definition: sse.h:204
static int haveAESNI()
Definition: sse.h:81
static bool operator<=(R123_ULONG_LONG, const r123m128i &)
Definition: sse.h:218
static bool operator<(const r123m128i &, const r123m128i &)
Definition: sse.h:224
r123m128i & operator=(R123_ULONG_LONG n)
Definition: sse.h:160
r123m128i & operator=(const __m128i &rhs)
Definition: sse.h:159
__m128i m
Definition: sse.h:149