1 // Written in the D programming language.
2 
3 /**
4  * Builtin SIMD intrinsics
5  *
6  * Source: $(DRUNTIMESRC core/_simd.d)
7  *
8  * Copyright: Copyright Digital Mars 2012-2020
9  * License:   $(HTTP www.boost.org/LICENSE_1_0.txt, Boost License 1.0).
 * Authors:   $(HTTP digitalmars.com, Walter Bright)
11  * Source:    $(DRUNTIMESRC core/_simd.d)
12  */
13 
14 module core.simd;
15 
16 pure:
17 nothrow:
18 @safe:
19 @nogc:
20 
21 /*******************************
22  * Create a vector type.
23  *
24  * Parameters:
25  *      T = one of double[2], float[4], void[16], byte[16], ubyte[16],
26  *      short[8], ushort[8], int[4], uint[4], long[2], ulong[2].
27  *      For 256 bit vectors,
28  *      one of double[4], float[8], void[32], byte[32], ubyte[32],
29  *      short[16], ushort[16], int[8], uint[8], long[4], ulong[4]
30  */
31 
template Vector(T)
{
    /* __vector is compiler magic, hide it behind a template.
     * The compiler will reject T's that don't work.
     * Eponymous template: `Vector!(T)` resolves directly to the
     * underlying `__vector(T)` type.
     */
    alias __vector(T) Vector;
}
39 
/* Handy aliases for the vector types the target supports.
 * Each alias is declared only if the compiler accepts the underlying
 * __vector type for the current target — hence the `static if` guards;
 * unsupported widths simply produce no alias.
 */
// 64 bit (8 byte) vectors
static if (is(Vector!(void[8])))    alias Vector!(void[8])    void8;        ///
static if (is(Vector!(double[1])))  alias Vector!(double[1])  double1;      ///
static if (is(Vector!(float[2])))   alias Vector!(float[2])   float2;       ///
static if (is(Vector!(byte[8])))    alias Vector!(byte[8])    byte8;        ///
static if (is(Vector!(ubyte[8])))   alias Vector!(ubyte[8])   ubyte8;       ///
static if (is(Vector!(short[4])))   alias Vector!(short[4])   short4;       ///
static if (is(Vector!(ushort[4])))  alias Vector!(ushort[4])  ushort4;      ///
static if (is(Vector!(int[2])))     alias Vector!(int[2])     int2;         ///
static if (is(Vector!(uint[2])))    alias Vector!(uint[2])    uint2;        ///
static if (is(Vector!(long[1])))    alias Vector!(long[1])    long1;        ///
static if (is(Vector!(ulong[1])))   alias Vector!(ulong[1])   ulong1;       ///

// 128 bit (16 byte) vectors (SSE/NEON class)
static if (is(Vector!(void[16])))   alias Vector!(void[16])   void16;       ///
static if (is(Vector!(double[2])))  alias Vector!(double[2])  double2;      ///
static if (is(Vector!(float[4])))   alias Vector!(float[4])   float4;       ///
static if (is(Vector!(byte[16])))   alias Vector!(byte[16])   byte16;       ///
static if (is(Vector!(ubyte[16])))  alias Vector!(ubyte[16])  ubyte16;      ///
static if (is(Vector!(short[8])))   alias Vector!(short[8])   short8;       ///
static if (is(Vector!(ushort[8])))  alias Vector!(ushort[8])  ushort8;      ///
static if (is(Vector!(int[4])))     alias Vector!(int[4])     int4;         ///
static if (is(Vector!(uint[4])))    alias Vector!(uint[4])    uint4;        ///
static if (is(Vector!(long[2])))    alias Vector!(long[2])    long2;        ///
static if (is(Vector!(ulong[2])))   alias Vector!(ulong[2])   ulong2;       ///

// 256 bit (32 byte) vectors (AVX class)
static if (is(Vector!(void[32])))   alias Vector!(void[32])   void32;       ///
static if (is(Vector!(double[4])))  alias Vector!(double[4])  double4;      ///
static if (is(Vector!(float[8])))   alias Vector!(float[8])   float8;       ///
static if (is(Vector!(byte[32])))   alias Vector!(byte[32])   byte32;       ///
static if (is(Vector!(ubyte[32])))  alias Vector!(ubyte[32])  ubyte32;      ///
static if (is(Vector!(short[16])))  alias Vector!(short[16])  short16;      ///
static if (is(Vector!(ushort[16]))) alias Vector!(ushort[16]) ushort16;     ///
static if (is(Vector!(int[8])))     alias Vector!(int[8])     int8;         ///
static if (is(Vector!(uint[8])))    alias Vector!(uint[8])    uint8;        ///
static if (is(Vector!(long[4])))    alias Vector!(long[4])    long4;        ///
static if (is(Vector!(ulong[4])))   alias Vector!(ulong[4])   ulong4;       ///

// 512 bit (64 byte) vectors (AVX-512 class)
static if (is(Vector!(void[64])))   alias Vector!(void[64])   void64;       ///
static if (is(Vector!(double[8])))  alias Vector!(double[8])  double8;      ///
static if (is(Vector!(float[16])))  alias Vector!(float[16])  float16;      ///
static if (is(Vector!(byte[64])))   alias Vector!(byte[64])   byte64;       ///
static if (is(Vector!(ubyte[64])))  alias Vector!(ubyte[64])  ubyte64;      ///
static if (is(Vector!(short[32])))  alias Vector!(short[32])  short32;      ///
static if (is(Vector!(ushort[32]))) alias Vector!(ushort[32]) ushort32;     ///
static if (is(Vector!(int[16])))    alias Vector!(int[16])    int16;        ///
static if (is(Vector!(uint[16])))   alias Vector!(uint[16])   uint16;       ///
static if (is(Vector!(long[8])))    alias Vector!(long[8])    long8;        ///
static if (is(Vector!(ulong[8])))   alias Vector!(ulong[8])   ulong8;       ///
89 
90 version (D_SIMD)
91 {
    /** XMM opcodes that conform to the following:
    *
    *  opcode xmm1,xmm2/mem
    *
    * and do not have side effects (i.e. do not write to memory).
    *
    * Each value packs the instruction encoding: an optional mandatory
    * prefix byte (66, F2 or F3; 00 when there is none), the 0F escape
    * byte(s), then the opcode byte. SSSE3/SSE4 entries use the 0F38/0F3A
    * three-byte escapes.
    */
    enum XMM
    {
        ADDSS = 0xF30F58,
        ADDSD = 0xF20F58,
        ADDPS = 0x000F58,
        ADDPD = 0x660F58,
        PADDB = 0x660FFC,
        PADDW = 0x660FFD,
        PADDD = 0x660FFE,
        PADDQ = 0x660FD4,

        SUBSS = 0xF30F5C,
        SUBSD = 0xF20F5C,
        SUBPS = 0x000F5C,
        SUBPD = 0x660F5C,
        PSUBB = 0x660FF8,
        PSUBW = 0x660FF9,
        PSUBD = 0x660FFA,
        PSUBQ = 0x660FFB,

        MULSS = 0xF30F59,
        MULSD = 0xF20F59,
        MULPS = 0x000F59,
        MULPD = 0x660F59,
        PMULLW = 0x660FD5,

        DIVSS = 0xF30F5E,
        DIVSD = 0xF20F5E,
        DIVPS = 0x000F5E,
        DIVPD = 0x660F5E,

        PAND  = 0x660FDB,
        POR   = 0x660FEB,

        UCOMISS = 0x000F2E,
        UCOMISD = 0x660F2E,

        XORPS = 0x000F57,
        XORPD = 0x660F57,

        // Use STO and LOD instead of MOV to distinguish the direction
        // (Destination is first operand, Source is second operand)
        STOSS  = 0xF30F11,        /// MOVSS xmm1/m32, xmm2
        STOSD  = 0xF20F11,        /// MOVSD xmm1/m64, xmm2
        STOAPS = 0x000F29,        /// MOVAPS xmm2/m128, xmm1
        STOAPD = 0x660F29,        /// MOVAPD xmm2/m128, xmm1
        STODQA = 0x660F7F,        /// MOVDQA xmm2/m128, xmm1
        STOD   = 0x660F7E,        /// MOVD reg/mem64, xmm   66 0F 7E /r
        STOQ   = 0x660FD6,        /// MOVQ xmm2/m64, xmm1

        LODSS  = 0xF30F10,        /// MOVSS xmm1, xmm2/m32
        LODSD  = 0xF20F10,        /// MOVSD xmm1, xmm2/m64
        LODAPS = 0x000F28,        /// MOVAPS xmm1, xmm2/m128
        LODAPD = 0x660F28,        /// MOVAPD xmm1, xmm2/m128
        LODDQA = 0x660F6F,        /// MOVDQA xmm1, xmm2/m128
        LODD   = 0x660F6E,        /// MOVD xmm, reg/mem64   66 0F 6E /r
        LODQ   = 0xF30F7E,        /// MOVQ xmm1, xmm2/m64

        LODDQU   = 0xF30F6F,      /// MOVDQU xmm1, xmm2/mem128  F3 0F 6F /r
        STODQU   = 0xF30F7F,      /// MOVDQU xmm1/mem128, xmm2  F3 0F 7F /r
        MOVDQ2Q  = 0xF20FD6,      /// MOVDQ2Q mmx, xmm          F2 0F D6 /r
        // Note: MOVHLPS/LODLPS and MOVLHPS/LODHPS intentionally share
        // opcode values — they are the register-to-register forms of the
        // same opcodes, distinguished by their operands.
        MOVHLPS  = 0x0F12,        /// MOVHLPS xmm1, xmm2        0F 12 /r
        LODHPD   = 0x660F16,      /// MOVHPD xmm1, m64
        STOHPD   = 0x660F17,      /// MOVHPD mem64, xmm1        66 0F 17 /r
        LODHPS   = 0x0F16,        /// MOVHPS xmm1, m64
        STOHPS   = 0x0F17,        /// MOVHPS m64, xmm1
        MOVLHPS  = 0x0F16,        /// MOVLHPS xmm1, xmm2
        LODLPD   = 0x660F12,      /// MOVLPD xmm1, m64
        STOLPD   = 0x660F13,      /// MOVLPD m64, xmm1
        LODLPS   = 0x0F12,        /// MOVLPS xmm1, m64
        STOLPS   = 0x0F13,        /// MOVLPS m64, xmm1
        MOVMSKPD = 0x660F50,      /// MOVMSKPD reg, xmm
        MOVMSKPS = 0x0F50,        /// MOVMSKPS reg, xmm
        MOVNTDQ  = 0x660FE7,      /// MOVNTDQ m128, xmm1
        MOVNTI   = 0x0FC3,        /// MOVNTI m32, r32
        MOVNTPD  = 0x660F2B,      /// MOVNTPD m128, xmm1
        MOVNTPS  = 0x0F2B,        /// MOVNTPS m128, xmm1
        MOVNTQ   = 0x0FE7,        /// MOVNTQ m64, mm
        MOVQ2DQ  = 0xF30FD6,      /// MOVQ2DQ
        LODUPD   = 0x660F10,      /// MOVUPD xmm1, xmm2/m128
        STOUPD   = 0x660F11,      /// MOVUPD xmm2/m128, xmm1
        LODUPS   = 0x0F10,        /// MOVUPS xmm1, xmm2/m128
        STOUPS   = 0x0F11,        /// MOVUPS xmm2/m128, xmm1

        PACKSSDW = 0x660F6B,
        PACKSSWB = 0x660F63,
        PACKUSWB = 0x660F67,
        PADDSB = 0x660FEC,
        PADDSW = 0x660FED,
        PADDUSB = 0x660FDC,
        PADDUSW = 0x660FDD,
        PANDN = 0x660FDF,
        PCMPEQB = 0x660F74,
        PCMPEQD = 0x660F76,
        PCMPEQW = 0x660F75,
        PCMPGTB = 0x660F64,
        PCMPGTD = 0x660F66,
        PCMPGTW = 0x660F65,
        PMADDWD = 0x660FF5,
        PSLLW = 0x660FF1,
        PSLLD = 0x660FF2,
        PSLLQ = 0x660FF3,
        PSRAW = 0x660FE1,
        PSRAD = 0x660FE2,
        PSRLW = 0x660FD1,
        PSRLD = 0x660FD2,
        PSRLQ = 0x660FD3,
        PSUBSB = 0x660FE8,
        PSUBSW = 0x660FE9,
        PSUBUSB = 0x660FD8,
        PSUBUSW = 0x660FD9,
        PUNPCKHBW = 0x660F68,
        PUNPCKHDQ = 0x660F6A,
        PUNPCKHWD = 0x660F69,
        PUNPCKLBW = 0x660F60,
        PUNPCKLDQ = 0x660F62,
        PUNPCKLWD = 0x660F61,
        PXOR = 0x660FEF,
        ANDPD = 0x660F54,
        ANDPS = 0x0F54,
        ANDNPD = 0x660F55,
        ANDNPS = 0x0F55,
        CMPPS = 0x0FC2,
        CMPPD = 0x660FC2,
        CMPSD = 0xF20FC2,
        CMPSS = 0xF30FC2,
        COMISD = 0x660F2F,
        COMISS = 0x0F2F,
        CVTDQ2PD = 0xF30FE6,
        CVTDQ2PS = 0x0F5B,
        CVTPD2DQ = 0xF20FE6,
        CVTPD2PI = 0x660F2D,
        CVTPD2PS = 0x660F5A,
        CVTPI2PD = 0x660F2A,
        CVTPI2PS = 0x0F2A,
        CVTPS2DQ = 0x660F5B,
        CVTPS2PD = 0x0F5A,
        CVTPS2PI = 0x0F2D,
        CVTSD2SI = 0xF20F2D,
        CVTSD2SS = 0xF20F5A,
        CVTSI2SD = 0xF20F2A,
        CVTSI2SS = 0xF30F2A,
        CVTSS2SD = 0xF30F5A,
        CVTSS2SI = 0xF30F2D,
        CVTTPD2PI = 0x660F2C,
        CVTTPD2DQ = 0x660FE6,
        CVTTPS2DQ = 0xF30F5B,
        CVTTPS2PI = 0x0F2C,
        CVTTSD2SI = 0xF20F2C,
        CVTTSS2SI = 0xF30F2C,
        MASKMOVDQU = 0x660FF7,
        MASKMOVQ = 0x0FF7,
        MAXPD = 0x660F5F,
        MAXPS = 0x0F5F,
        MAXSD = 0xF20F5F,
        MAXSS = 0xF30F5F,
        MINPD = 0x660F5D,
        MINPS = 0x0F5D,
        MINSD = 0xF20F5D,
        MINSS = 0xF30F5D,
        ORPD = 0x660F56,
        ORPS = 0x0F56,
        PAVGB = 0x660FE0,
        PAVGW = 0x660FE3,
        PMAXSW = 0x660FEE,
        //PINSRW = 0x660FC4,
        PMAXUB = 0x660FDE,
        PMINSW = 0x660FEA,
        PMINUB = 0x660FDA,
        //PMOVMSKB = 0x660FD7,
        PMULHUW = 0x660FE4,
        PMULHW = 0x660FE5,
        PMULUDQ = 0x660FF4,
        PSADBW = 0x660FF6,
        PUNPCKHQDQ = 0x660F6D,
        PUNPCKLQDQ = 0x660F6C,
        RCPPS = 0x0F53,
        RCPSS = 0xF30F53,
        RSQRTPS = 0x0F52,
        RSQRTSS = 0xF30F52,
        SQRTPD = 0x660F51,
        SHUFPD = 0x660FC6,
        SHUFPS = 0x0FC6,
        SQRTPS = 0x0F51,
        SQRTSD = 0xF20F51,
        SQRTSS = 0xF30F51,
        UNPCKHPD = 0x660F15,
        UNPCKHPS = 0x0F15,
        UNPCKLPD = 0x660F14,
        UNPCKLPS = 0x0F14,

        PSHUFD = 0x660F70,
        PSHUFHW = 0xF30F70,
        PSHUFLW = 0xF20F70,
        PSHUFW = 0x0F70,
        // For the byte shifts the top byte carries the ModRM reg-field
        // digit (66 0F 73 /7 for PSLLDQ, /3 for PSRLDQ).
        PSLLDQ = 0x07660F73,
        PSRLDQ = 0x03660F73,

        //PREFETCH = 0x0F18,

        // SSE3 Pentium 4 (Prescott)

        ADDSUBPD = 0x660FD0,
        ADDSUBPS = 0xF20FD0,
        HADDPD   = 0x660F7C,
        HADDPS   = 0xF20F7C,
        HSUBPD   = 0x660F7D,
        HSUBPS   = 0xF20F7D,
        MOVDDUP  = 0xF20F12,
        MOVSHDUP = 0xF30F16,
        MOVSLDUP = 0xF30F12,
        LDDQU    = 0xF20FF0,
        MONITOR  = 0x0F01C8,
        MWAIT    = 0x0F01C9,

        // SSSE3
        PALIGNR = 0x660F3A0F,
        PHADDD = 0x660F3802,
        PHADDW = 0x660F3801,
        PHADDSW = 0x660F3803,
        PABSB = 0x660F381C,
        PABSD = 0x660F381E,
        PABSW = 0x660F381D,
        PSIGNB = 0x660F3808,
        PSIGND = 0x660F380A,
        PSIGNW = 0x660F3809,
        PSHUFB = 0x660F3800,
        PMADDUBSW = 0x660F3804,
        PMULHRSW = 0x660F380B,
        PHSUBD = 0x660F3806,
        PHSUBW = 0x660F3805,
        PHSUBSW = 0x660F3807,

        // SSE4.1

        BLENDPD   = 0x660F3A0D,
        BLENDPS   = 0x660F3A0C,
        BLENDVPD  = 0x660F3815,
        BLENDVPS  = 0x660F3814,
        DPPD      = 0x660F3A41,
        DPPS      = 0x660F3A40,
        EXTRACTPS = 0x660F3A17,
        INSERTPS  = 0x660F3A21,
        MPSADBW   = 0x660F3A42,
        PBLENDVB  = 0x660F3810,
        PBLENDW   = 0x660F3A0E,
        // PEXTRQ/PINSRQ deliberately reuse the PEXTRD/PINSRD opcodes;
        // the 64-bit forms differ only by REX.W.
        PEXTRD    = 0x660F3A16,
        PEXTRQ    = 0x660F3A16,
        PINSRB    = 0x660F3A20,
        PINSRD    = 0x660F3A22,
        PINSRQ    = 0x660F3A22,

        MOVNTDQA = 0x660F382A,
        PACKUSDW = 0x660F382B,
        PCMPEQQ = 0x660F3829,
        PEXTRB = 0x660F3A14,
        PHMINPOSUW = 0x660F3841,
        PMAXSB = 0x660F383C,
        PMAXSD = 0x660F383D,
        PMAXUD = 0x660F383F,
        PMAXUW = 0x660F383E,
        PMINSB = 0x660F3838,
        PMINSD = 0x660F3839,
        PMINUD = 0x660F383B,
        PMINUW = 0x660F383A,
        PMOVSXBW = 0x660F3820,
        PMOVSXBD = 0x660F3821,
        PMOVSXBQ = 0x660F3822,
        PMOVSXWD = 0x660F3823,
        PMOVSXWQ = 0x660F3824,
        PMOVSXDQ = 0x660F3825,
        PMOVZXBW = 0x660F3830,
        PMOVZXBD = 0x660F3831,
        PMOVZXBQ = 0x660F3832,
        PMOVZXWD = 0x660F3833,
        PMOVZXWQ = 0x660F3834,
        PMOVZXDQ = 0x660F3835,
        PMULDQ   = 0x660F3828,
        PMULLD   = 0x660F3840,
        PTEST    = 0x660F3817,

        ROUNDPD = 0x660F3A09,
        ROUNDPS = 0x660F3A08,
        ROUNDSD = 0x660F3A0B,
        ROUNDSS = 0x660F3A0A,

        // SSE4.2
        PCMPESTRI  = 0x660F3A61,
        PCMPESTRM  = 0x660F3A60,
        PCMPISTRI  = 0x660F3A63,
        PCMPISTRM  = 0x660F3A62,
        PCMPGTQ    = 0x660F3837,
        //CRC32

        // SSE4a (AMD only)
        // EXTRQ,INSERTQ,MOVNTSD,MOVNTSS

        // POPCNT and LZCNT (have their own CPUID bits)
        POPCNT     = 0xF30FB8,
        // LZCNT
    }
399 
400     /**
401     * Generate two operand instruction with XMM 128 bit operands.
402     *
403     * This is a compiler magic function - it doesn't behave like
404     * regular D functions.
405     *
406     * Parameters:
407     *      opcode = any of the XMM opcodes; it must be a compile time constant
408     *      op1    = first operand
409     *      op2    = second operand
410     * Returns:
411     *      result of opcode
412     * Example:
413     ---
414     import core.simd;
415     import core.stdc.stdio;
416 
417     void main()
418     {
419         float4 A = [2.34f, -70000.0f, 0.00001f, 345.5f];
420         float4 R = A;
421         R = cast(float4) __simd(XMM.RCPSS, R, A);
422         printf("%g %g %g %g\n", R.array[0], R.array[1], R.array[2], R.array[3]);
423     }
424     ---
425     * Prints `0.427368 -70000 1e-05 345.5`.
426     * The use of the two operand form for `XMM.RCPSS` is necessary because the result of the instruction
427     * contains elements of both operands.
428     * Example:
429     ---
430     double[2] A = [56.0, -75.0];
431     double2 R = cast(double2) __simd(XMM.LODUPD, *cast(double2*)A.ptr);
432     ---
433     * The cast to `double2*` is necessary because the type of `*A.ptr` is `double`.
434     */
435     pure @safe void16 __simd(XMM opcode, void16 op1, void16 op2);
436 
437     ///
438     unittest
439     {
440         float4 a;
441         a = cast(float4)__simd(XMM.PXOR, a, a);
442     }
443 
444     /**
445     * Unary SIMD instructions.
446     */
447     pure @safe void16 __simd(XMM opcode, void16 op1);
448     pure @safe void16 __simd(XMM opcode, double d);   ///
449     pure @safe void16 __simd(XMM opcode, float f);    ///
450 
451     ///
452     unittest
453     {
454         float4 a;
455         a = cast(float4)__simd(XMM.LODSS, a);
456     }
457 
458     /****
459     * For instructions:
460     * CMPPD, CMPSS, CMPSD, CMPPS,
461     * PSHUFD, PSHUFHW, PSHUFLW,
462     * BLENDPD, BLENDPS, DPPD, DPPS,
463     * MPSADBW, PBLENDW,
464     * ROUNDPD, ROUNDPS, ROUNDSD, ROUNDSS
465     * Parameters:
466     *      opcode = any of the above XMM opcodes; it must be a compile time constant
467     *      op1    = first operand
468     *      op2    = second operand
469     *      imm8   = third operand; must be a compile time constant
470     * Returns:
471     *      result of opcode
472     */
473     pure @safe void16 __simd(XMM opcode, void16 op1, void16 op2, ubyte imm8);
474 
475     ///
476     unittest
477     {
478         float4 a;
479         a = cast(float4)__simd(XMM.CMPPD, a, a, 0x7A);
480     }
481 
482     /***
483     * For instructions with the imm8 version:
484     * PSLLD, PSLLQ, PSLLW, PSRAD, PSRAW, PSRLD, PSRLQ, PSRLW,
485     * PSRLDQ, PSLLDQ
486     * Parameters:
487     *      opcode = any of the XMM opcodes; it must be a compile time constant
488     *      op1    = first operand
489     *      imm8   = second operand; must be a compile time constant
490     * Returns:
491     *      result of opcode
492     */
493     pure @safe void16 __simd_ib(XMM opcode, void16 op1, ubyte imm8);
494 
495     ///
496     unittest
497     {
498         float4 a;
499         a = cast(float4) __simd_ib(XMM.PSRLQ, a, 0x7A);
500     }
501 
502     /*****
503     * For "store" operations of the form:
504     *    op1 op= op2
505     * such as MOVLPS.
506     * Returns:
507     *    op2
508     * These cannot be marked as pure, as semantic() doesn't check them.
509     */
510     @safe void16 __simd_sto(XMM opcode, void16 op1, void16 op2);
511     @safe void16 __simd_sto(XMM opcode, double op1, void16 op2); ///
512     @safe void16 __simd_sto(XMM opcode, float op1, void16 op2);  ///
513     @safe void16 __simd_sto(XMM opcode, void16 op1, long op2); ///
514 
515     ///
516     unittest
517     {
518         void16 a;
519         float f = 1;
520         double d = 1;
521 
522         cast(void)__simd_sto(XMM.STOUPS, a, a);
523         cast(void)__simd_sto(XMM.STOUPS, f, a);
524         cast(void)__simd_sto(XMM.STOUPS, d, a);
525     }
526 
527     /* The following use overloading to ensure correct typing.
528     * Compile with inlining on for best performance.
529     */
530 
531     pure @safe short8 pcmpeq()(short8 v1, short8 v2)
532     {
533         return cast(short8)__simd(XMM.PCMPEQW, v1, v2);
534     }
535 
536     pure @safe ushort8 pcmpeq()(ushort8 v1, ushort8 v2)
537     {
538         return cast(ushort8)__simd(XMM.PCMPEQW, v1, v2);
539     }
540 
541     /*********************
542     * Emit prefetch instruction.
543     * Params:
544     *    address = address to be prefetched
545     *    writeFetch = true for write fetch, false for read fetch
546     *    locality = 0..3 (0 meaning least local, 3 meaning most local)
547     * Note:
548     *    The Intel mappings are:
549     *    $(TABLE
550     *    $(THEAD writeFetch, locality, Instruction)
551     *    $(TROW false, 0, prefetchnta)
552     *    $(TROW false, 1, prefetch2)
553     *    $(TROW false, 2, prefetch1)
554     *    $(TROW false, 3, prefetch0)
555     *    $(TROW true, 0, prefetchw)
556     *    $(TROW true, 1, prefetchw)
557     *    $(TROW true, 2, prefetchw)
558     *    $(TROW true, 3, prefetchw)
559     *    )
560     */
561     void prefetch(bool writeFetch, ubyte locality)(const(void)* address)
562     {
563         static if (writeFetch)
564             __prefetch(address, 4);
565         else static if (locality < 4)
566             __prefetch(address, 3 - locality);
567         else
568             static assert(0, "0..3 expected for locality");
569     }
570 
    // Compiler intrinsic backing prefetch(). The encoding argument selects
    // the instruction: 0..3 map to prefetcht0/t1/t2/prefetchnta respectively
    // (i.e. 3 - locality, per the table above), and 4 selects prefetchw.
    private void __prefetch(const(void*) address, ubyte encoding);
572 
573     /*************************************
574     * Load unaligned vector from address.
575     * This is a compiler intrinsic.
576     * Params:
577     *    p = pointer to vector
578     * Returns:
579     *    vector
580     */
581 
582     V loadUnaligned(V)(const V* p)
583         if (is(V == void16) ||
584             is(V == byte16) ||
585             is(V == ubyte16) ||
586             is(V == short8) ||
587             is(V == ushort8) ||
588             is(V == int4) ||
589             is(V == uint4) ||
590             is(V == long2) ||
591             is(V == ulong2) ||
592             is(V == double2) ||
593             is(V == float4))
594     {
595         pragma(inline, true);
596         static if (is(V == double2))
597             return cast(V)__simd(XMM.LODUPD, *cast(const void16*)p);
598         else static if (is(V == float4))
599             return cast(V)__simd(XMM.LODUPS, *cast(const void16*)p);
600         else
601             return cast(V)__simd(XMM.LODDQU, *cast(const void16*)p);
602     }
603 
604     @system
605     unittest
606     {
607         // Memory to load into the vector:
608         // Should have enough data to test all 16-byte alignments, and still
609         // have room for a 16-byte vector
610         ubyte[32] data;
611         foreach (i; 0..data.length)
612         {
613             data[i] = cast(ubyte)i;
614         }
615 
616         // to test all alignments from 1 ~ 16
617         foreach (i; 0..16)
618         {
619             ubyte* d = &data[i];
620 
621             void test(T)()
622             {
623                 // load the data
624                 T v = loadUnaligned(cast(T*)d);
625 
626                 // check that the data was loaded correctly
627                 ubyte* ptrToV = cast(ubyte*)&v;
628                 foreach (j; 0..T.sizeof)
629                 {
630                     assert(ptrToV[j] == d[j]);
631                 }
632             }
633 
634             test!void16();
635             test!byte16();
636             test!ubyte16();
637             test!short8();
638             test!ushort8();
639             test!int4();
640             test!uint4();
641             test!long2();
642             test!ulong2();
643             test!double2();
644             test!float4();
645         }
646     }
647 
648     /*************************************
649     * Store vector to unaligned address.
650     * This is a compiler intrinsic.
651     * Params:
652     *    p = pointer to vector
653     *    value = value to store
654     * Returns:
655     *    value
656     */
657 
658     V storeUnaligned(V)(V* p, V value)
659         if (is(V == void16) ||
660             is(V == byte16) ||
661             is(V == ubyte16) ||
662             is(V == short8) ||
663             is(V == ushort8) ||
664             is(V == int4) ||
665             is(V == uint4) ||
666             is(V == long2) ||
667             is(V == ulong2) ||
668             is(V == double2) ||
669             is(V == float4))
670     {
671         pragma(inline, true);
672         static if (is(V == double2))
673             return cast(V)__simd_sto(XMM.STOUPD, *cast(void16*)p, value);
674         else static if (is(V == float4))
675             return cast(V)__simd_sto(XMM.STOUPS, *cast(void16*)p, value);
676         else
677             return cast(V)__simd_sto(XMM.STODQU, *cast(void16*)p, value);
678     }
679 
680     @system
681     unittest
682     {
683         // Memory to store the vector to:
684         // Should have enough data to test all 16-byte alignments, and still
685         // have room for a 16-byte vector
686         ubyte[32] data;
687 
688         // to test all alignments from 1 ~ 16
689         foreach (i; 0..16)
690         {
691             ubyte* d = &data[i];
692 
693             void test(T)()
694             {
695                 T v;
696 
697                 // populate v` with data
698                 ubyte* ptrToV = cast(ubyte*)&v;
699                 foreach (j; 0..T.sizeof)
700                 {
701                     ptrToV[j] = cast(ubyte)j;
702                 }
703 
704                 // store `v` to location pointed to by `d`
705                 storeUnaligned(cast(T*)d, v);
706 
707                 // check that the data was stored correctly
708                 foreach (j; 0..T.sizeof)
709                 {
710                     assert(ptrToV[j] == d[j]);
711                 }
712             }
713 
714             test!void16();
715             test!byte16();
716             test!ubyte16();
717             test!short8();
718             test!ushort8();
719             test!int4();
720             test!uint4();
721             test!long2();
722             test!ulong2();
723             test!double2();
724             test!float4();
725         }
726     }
727 }