// Written in the D programming language.

/**
 * Builtin SIMD intrinsics
 *
 * Source: $(DRUNTIMESRC core/_simd.d)
 *
 * Copyright: Copyright Digital Mars 2012-2020
 * License:   $(HTTP www.boost.org/LICENSE_1_0.txt, Boost License 1.0).
 * Authors:   $(HTTP digitalmars.com, Walter Bright)
 */

module core.simd;

pure:
nothrow:
@safe:
@nogc:

/*******************************
 * Create a vector type.
 *
 * Parameters:
 *      T = one of double[2], float[4], void[16], byte[16], ubyte[16],
 *      short[8], ushort[8], int[4], uint[4], long[2], ulong[2].
 *      For 256 bit vectors,
 *      one of double[4], float[8], void[32], byte[32], ubyte[32],
 *      short[16], ushort[16], int[8], uint[8], long[4], ulong[4]
 */
template Vector(T)
{
    /* __vector is compiler magic, hide it behind a template.
     * The compiler will reject T's that don't work.
     */
    alias __vector(T) Vector;
}

/* Handy aliases
 */
static if (is(Vector!(void[8])))    alias Vector!(void[8])    void8;    ///
static if (is(Vector!(double[1]))) alias Vector!(double[1])  double1;  ///
static if (is(Vector!(float[2])))  alias Vector!(float[2])   float2;   ///
static if (is(Vector!(byte[8])))   alias Vector!(byte[8])    byte8;    ///
static if (is(Vector!(ubyte[8])))  alias Vector!(ubyte[8])   ubyte8;   ///
static if (is(Vector!(short[4])))  alias Vector!(short[4])   short4;   ///
static if (is(Vector!(ushort[4]))) alias Vector!(ushort[4])  ushort4;  ///
static if (is(Vector!(int[2])))    alias Vector!(int[2])     int2;     ///
static if (is(Vector!(uint[2])))   alias Vector!(uint[2])    uint2;    ///
static if (is(Vector!(long[1])))   alias Vector!(long[1])    long1;    ///
static if (is(Vector!(ulong[1])))  alias Vector!(ulong[1])   ulong1;   ///

static if (is(Vector!(void[16])))   alias Vector!(void[16])   void16;   ///
static if (is(Vector!(double[2])))  alias Vector!(double[2])  double2;  ///
static if (is(Vector!(float[4])))   alias Vector!(float[4])   float4;   ///
static if (is(Vector!(byte[16])))   alias Vector!(byte[16])   byte16;   ///
static if (is(Vector!(ubyte[16])))  alias Vector!(ubyte[16])  ubyte16;  ///
static if (is(Vector!(short[8])))   alias Vector!(short[8])   short8;   ///
static if (is(Vector!(ushort[8])))  alias Vector!(ushort[8])  ushort8;  ///
static if (is(Vector!(int[4])))     alias Vector!(int[4])     int4;     ///
static if (is(Vector!(uint[4])))    alias Vector!(uint[4])    uint4;    ///
static if (is(Vector!(long[2])))    alias Vector!(long[2])    long2;    ///
static if (is(Vector!(ulong[2])))   alias Vector!(ulong[2])   ulong2;   ///

static if (is(Vector!(void[32])))   alias Vector!(void[32])   void32;   ///
static if (is(Vector!(double[4])))  alias Vector!(double[4])  double4;  ///
static if (is(Vector!(float[8])))   alias Vector!(float[8])   float8;   ///
static if (is(Vector!(byte[32])))   alias Vector!(byte[32])   byte32;   ///
static if (is(Vector!(ubyte[32])))  alias Vector!(ubyte[32])  ubyte32;  ///
static if (is(Vector!(short[16])))  alias Vector!(short[16])  short16;  ///
static if (is(Vector!(ushort[16]))) alias Vector!(ushort[16]) ushort16; ///
static if (is(Vector!(int[8])))     alias Vector!(int[8])     int8;     ///
static if (is(Vector!(uint[8])))    alias Vector!(uint[8])    uint8;    ///
static if (is(Vector!(long[4])))    alias Vector!(long[4])    long4;    ///
static if (is(Vector!(ulong[4])))   alias Vector!(ulong[4])   ulong4;   ///

static if (is(Vector!(void[64])))   alias Vector!(void[64])   void64;   ///
static if (is(Vector!(double[8])))  alias Vector!(double[8])  double8;  ///
static if (is(Vector!(float[16])))  alias Vector!(float[16])  float16;  ///
static if (is(Vector!(byte[64])))   alias Vector!(byte[64])   byte64;   ///
static if (is(Vector!(ubyte[64])))  alias Vector!(ubyte[64])  ubyte64;  ///
static if (is(Vector!(short[32])))  alias Vector!(short[32])  short32;  ///
static if (is(Vector!(ushort[32]))) alias Vector!(ushort[32]) ushort32; ///
static if (is(Vector!(int[16])))    alias Vector!(int[16])    int16;    ///
static if (is(Vector!(uint[16])))   alias Vector!(uint[16])   uint16;   ///
static if (is(Vector!(long[8])))    alias Vector!(long[8])    long8;    ///
static if (is(Vector!(ulong[8])))   alias Vector!(ulong[8])   ulong8;   ///
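
/// Example (illustrative sketch, not from the original module): the aliases
/// above behave like fixed-length value types. Vector operands support the
/// usual arithmetic operators element-wise, and `.array` exposes the lanes.
/// Guarded because the alias only exists on targets with 128-bit vectors.
static if (is(Vector!(float[4])))
{
    unittest
    {
        float4 a = [1.0f, 2.0f, 3.0f, 4.0f];
        float4 b = [10.0f, 20.0f, 30.0f, 40.0f];
        float4 c = a + b;           // element-wise addition
        assert(c.array[0] == 11.0f);
        assert(c.array[3] == 44.0f);
    }
}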
version (D_SIMD)
{
    /** XMM opcodes that conform to the following:
     *
     *  opcode xmm1,xmm2/mem
     *
     * and do not have side effects (i.e. do not write to memory).
     */
    enum XMM
    {
        ADDSS = 0xF30F58,
        ADDSD = 0xF20F58,
        ADDPS = 0x000F58,
        ADDPD = 0x660F58,
        PADDB = 0x660FFC,
        PADDW = 0x660FFD,
        PADDD = 0x660FFE,
        PADDQ = 0x660FD4,

        SUBSS = 0xF30F5C,
        SUBSD = 0xF20F5C,
        SUBPS = 0x000F5C,
        SUBPD = 0x660F5C,
        PSUBB = 0x660FF8,
        PSUBW = 0x660FF9,
        PSUBD = 0x660FFA,
        PSUBQ = 0x660FFB,

        MULSS = 0xF30F59,
        MULSD = 0xF20F59,
        MULPS = 0x000F59,
        MULPD = 0x660F59,
        PMULLW = 0x660FD5,

        DIVSS = 0xF30F5E,
        DIVSD = 0xF20F5E,
        DIVPS = 0x000F5E,
        DIVPD = 0x660F5E,

        PAND = 0x660FDB,
        POR  = 0x660FEB,

        UCOMISS = 0x000F2E,
        UCOMISD = 0x660F2E,

        XORPS = 0x000F57,
        XORPD = 0x660F57,

        // Use STO and LOD instead of MOV to distinguish the direction
        // (Destination is first operand, Source is second operand)
        STOSS  = 0xF30F11,  /// MOVSS xmm1/m32, xmm2
        STOSD  = 0xF20F11,  /// MOVSD xmm1/m64, xmm2
        STOAPS = 0x000F29,  /// MOVAPS xmm2/m128, xmm1
        STOAPD = 0x660F29,  /// MOVAPD xmm2/m128, xmm1
        STODQA = 0x660F7F,  /// MOVDQA xmm2/m128, xmm1
        STOD   = 0x660F7E,  /// MOVD reg/mem64, xmm   66 0F 7E /r
        STOQ   = 0x660FD6,  /// MOVQ xmm2/m64, xmm1

        LODSS  = 0xF30F10,  /// MOVSS xmm1, xmm2/m32
        LODSD  = 0xF20F10,  /// MOVSD xmm1, xmm2/m64
        LODAPS = 0x000F28,  /// MOVAPS xmm1, xmm2/m128
        LODAPD = 0x660F28,  /// MOVAPD xmm1, xmm2/m128
        LODDQA = 0x660F6F,  /// MOVDQA xmm1, xmm2/m128
        LODD   = 0x660F6E,  /// MOVD xmm, reg/mem64   66 0F 6E /r
        LODQ   = 0xF30F7E,  /// MOVQ xmm1, xmm2/m64

        LODDQU   = 0xF30F6F,  /// MOVDQU xmm1, xmm2/mem128  F3 0F 6F /r
        STODQU   = 0xF30F7F,  /// MOVDQU xmm1/mem128, xmm2  F3 0F 7F /r
        MOVDQ2Q  = 0xF20FD6,  /// MOVDQ2Q mmx, xmm          F2 0F D6 /r
        MOVHLPS  = 0x0F12,    /// MOVHLPS xmm1, xmm2        0F 12 /r
        LODHPD   = 0x660F16,  /// MOVHPD xmm1, m64
        STOHPD   = 0x660F17,  /// MOVHPD mem64, xmm1        66 0F 17 /r
        LODHPS   = 0x0F16,    /// MOVHPS xmm1, m64
        STOHPS   = 0x0F17,    /// MOVHPS m64, xmm1
        MOVLHPS  = 0x0F16,    /// MOVLHPS xmm1, xmm2
        LODLPD   = 0x660F12,  /// MOVLPD xmm1, m64
        STOLPD   = 0x660F13,  /// MOVLPD m64, xmm1
        LODLPS   = 0x0F12,    /// MOVLPS xmm1, m64
        STOLPS   = 0x0F13,    /// MOVLPS m64, xmm1
        MOVMSKPD = 0x660F50,  /// MOVMSKPD reg, xmm
        MOVMSKPS = 0x0F50,    /// MOVMSKPS reg, xmm
        MOVNTDQ  = 0x660FE7,  /// MOVNTDQ m128, xmm1
        MOVNTI   = 0x0FC3,    /// MOVNTI m32, r32
        MOVNTPD  = 0x660F2B,  /// MOVNTPD m128, xmm1
        MOVNTPS  = 0x0F2B,    /// MOVNTPS m128, xmm1
        MOVNTQ   = 0x0FE7,    /// MOVNTQ m64, mm
        MOVQ2DQ  = 0xF30FD6,  /// MOVQ2DQ
        LODUPD   = 0x660F10,  /// MOVUPD xmm1, xmm2/m128
        STOUPD   = 0x660F11,  /// MOVUPD xmm2/m128, xmm1
        LODUPS   = 0x0F10,    /// MOVUPS xmm1, xmm2/m128
        STOUPS   = 0x0F11,    /// MOVUPS xmm2/m128, xmm1

        PACKSSDW = 0x660F6B,
        PACKSSWB = 0x660F63,
        PACKUSWB = 0x660F67,
        PADDSB = 0x660FEC,
        PADDSW = 0x660FED,
        PADDUSB = 0x660FDC,
        PADDUSW = 0x660FDD,
        PANDN = 0x660FDF,
        PCMPEQB = 0x660F74,
        PCMPEQD = 0x660F76,
        PCMPEQW = 0x660F75,
        PCMPGTB = 0x660F64,
        PCMPGTD = 0x660F66,
        PCMPGTW = 0x660F65,
        PMADDWD = 0x660FF5,
        PSLLW = 0x660FF1,
        PSLLD = 0x660FF2,
        PSLLQ = 0x660FF3,
        PSRAW = 0x660FE1,
        PSRAD = 0x660FE2,
        PSRLW = 0x660FD1,
        PSRLD = 0x660FD2,
        PSRLQ = 0x660FD3,
        PSUBSB = 0x660FE8,
        PSUBSW = 0x660FE9,
        PSUBUSB = 0x660FD8,
        PSUBUSW = 0x660FD9,
        PUNPCKHBW = 0x660F68,
        PUNPCKHDQ = 0x660F6A,
        PUNPCKHWD = 0x660F69,
        PUNPCKLBW = 0x660F60,
        PUNPCKLDQ = 0x660F62,
        PUNPCKLWD = 0x660F61,
        PXOR = 0x660FEF,
        ANDPD = 0x660F54,
        ANDPS = 0x0F54,
        ANDNPD = 0x660F55,
        ANDNPS = 0x0F55,
        CMPPS = 0x0FC2,
        CMPPD = 0x660FC2,
        CMPSD = 0xF20FC2,
        CMPSS = 0xF30FC2,
        COMISD = 0x660F2F,
        COMISS = 0x0F2F,
        CVTDQ2PD = 0xF30FE6,
        CVTDQ2PS = 0x0F5B,
        CVTPD2DQ = 0xF20FE6,
        CVTPD2PI = 0x660F2D,
        CVTPD2PS = 0x660F5A,
        CVTPI2PD = 0x660F2A,
        CVTPI2PS = 0x0F2A,
        CVTPS2DQ = 0x660F5B,
        CVTPS2PD = 0x0F5A,
        CVTPS2PI = 0x0F2D,
        CVTSD2SI = 0xF20F2D,
        CVTSD2SS = 0xF20F5A,
        CVTSI2SD = 0xF20F2A,
        CVTSI2SS = 0xF30F2A,
        CVTSS2SD = 0xF30F5A,
        CVTSS2SI = 0xF30F2D,
        CVTTPD2PI = 0x660F2C,
        CVTTPD2DQ = 0x660FE6,
        CVTTPS2DQ = 0xF30F5B,
        CVTTPS2PI = 0x0F2C,
        CVTTSD2SI = 0xF20F2C,
        CVTTSS2SI = 0xF30F2C,
        MASKMOVDQU = 0x660FF7,
        MASKMOVQ = 0x0FF7,
        MAXPD = 0x660F5F,
        MAXPS = 0x0F5F,
        MAXSD = 0xF20F5F,
        MAXSS = 0xF30F5F,
        MINPD = 0x660F5D,
        MINPS = 0x0F5D,
        MINSD = 0xF20F5D,
        MINSS = 0xF30F5D,
        ORPD = 0x660F56,
        ORPS = 0x0F56,
        PAVGB = 0x660FE0,
        PAVGW = 0x660FE3,
        PMAXSW = 0x660FEE,
        //PINSRW = 0x660FC4,
        PMAXUB = 0x660FDE,
        PMINSW = 0x660FEA,
        PMINUB = 0x660FDA,
        //PMOVMSKB = 0x660FD7,
        PMULHUW = 0x660FE4,
        PMULHW = 0x660FE5,
        PMULUDQ = 0x660FF4,
        PSADBW = 0x660FF6,
        PUNPCKHQDQ = 0x660F6D,
        PUNPCKLQDQ = 0x660F6C,
        RCPPS = 0x0F53,
        RCPSS = 0xF30F53,
        RSQRTPS = 0x0F52,
        RSQRTSS = 0xF30F52,
        SQRTPD = 0x660F51,
        SHUFPD = 0x660FC6,
        SHUFPS = 0x0FC6,
        SQRTPS = 0x0F51,
        SQRTSD = 0xF20F51,
        SQRTSS = 0xF30F51,
        UNPCKHPD = 0x660F15,
        UNPCKHPS = 0x0F15,
        UNPCKLPD = 0x660F14,
        UNPCKLPS = 0x0F14,

        PSHUFD = 0x660F70,
        PSHUFHW = 0xF30F70,
        PSHUFLW = 0xF20F70,
        PSHUFW = 0x0F70,
        PSLLDQ = 0x07660F73,
        PSRLDQ = 0x03660F73,

        //PREFETCH = 0x0F18,

        // SSE3 Pentium 4 (Prescott)

        ADDSUBPD = 0x660FD0,
        ADDSUBPS = 0xF20FD0,
        HADDPD = 0x660F7C,
        HADDPS = 0xF20F7C,
        HSUBPD = 0x660F7D,
        HSUBPS = 0xF20F7D,
        MOVDDUP = 0xF20F12,
        MOVSHDUP = 0xF30F16,
        MOVSLDUP = 0xF30F12,
        LDDQU = 0xF20FF0,
        MONITOR = 0x0F01C8,
        MWAIT = 0x0F01C9,

        // SSSE3
        PALIGNR = 0x660F3A0F,
        PHADDD = 0x660F3802,
        PHADDW = 0x660F3801,
        PHADDSW = 0x660F3803,
        PABSB = 0x660F381C,
        PABSD = 0x660F381E,
        PABSW = 0x660F381D,
        PSIGNB = 0x660F3808,
        PSIGND = 0x660F380A,
        PSIGNW = 0x660F3809,
        PSHUFB = 0x660F3800,
        PMADDUBSW = 0x660F3804,
        PMULHRSW = 0x660F380B,
        PHSUBD = 0x660F3806,
        PHSUBW = 0x660F3805,
        PHSUBSW = 0x660F3807,

        // SSE4.1

        BLENDPD  = 0x660F3A0D,
        BLENDPS  = 0x660F3A0C,
        BLENDVPD = 0x660F3815,
        BLENDVPS  = 0x660F3814,
        DPPD      = 0x660F3A41,
        DPPS      = 0x660F3A40,
        EXTRACTPS = 0x660F3A17,
        INSERTPS  = 0x660F3A21,
        MPSADBW   = 0x660F3A42,
        PBLENDVB  = 0x660F3810,
        PBLENDW   = 0x660F3A0E,
        PEXTRD    = 0x660F3A16,
        PEXTRQ    = 0x660F3A16,
        PINSRB    = 0x660F3A20,
        PINSRD    = 0x660F3A22,
        PINSRQ    = 0x660F3A22,

        MOVNTDQA = 0x660F382A,
        PACKUSDW = 0x660F382B,
        PCMPEQQ = 0x660F3829,
        PEXTRB = 0x660F3A14,
        PHMINPOSUW = 0x660F3841,
        PMAXSB = 0x660F383C,
        PMAXSD = 0x660F383D,
        PMAXUD = 0x660F383F,
        PMAXUW = 0x660F383E,
        PMINSB = 0x660F3838,
        PMINSD = 0x660F3839,
        PMINUD = 0x660F383B,
        PMINUW = 0x660F383A,
        PMOVSXBW = 0x660F3820,
        PMOVSXBD = 0x660F3821,
        PMOVSXBQ = 0x660F3822,
        PMOVSXWD = 0x660F3823,
        PMOVSXWQ = 0x660F3824,
        PMOVSXDQ = 0x660F3825,
        PMOVZXBW = 0x660F3830,
        PMOVZXBD = 0x660F3831,
        PMOVZXBQ = 0x660F3832,
        PMOVZXWD = 0x660F3833,
        PMOVZXWQ = 0x660F3834,
        PMOVZXDQ = 0x660F3835,
        PMULDQ = 0x660F3828,
        PMULLD = 0x660F3840,
        PTEST = 0x660F3817,

        ROUNDPD = 0x660F3A09,
        ROUNDPS = 0x660F3A08,
        ROUNDSD = 0x660F3A0B,
        ROUNDSS = 0x660F3A0A,

        // SSE4.2
        PCMPESTRI = 0x660F3A61,
        PCMPESTRM = 0x660F3A60,
        PCMPISTRI = 0x660F3A63,
        PCMPISTRM = 0x660F3A62,
        PCMPGTQ = 0x660F3837,
        //CRC32

        // SSE4a (AMD only)
        // EXTRQ, INSERTQ, MOVNTSD, MOVNTSS

        // POPCNT and LZCNT (have their own CPUID bits)
        POPCNT = 0xF30FB8,
        // LZCNT
    }

    /**
     * Generate two operand instruction with XMM 128 bit operands.
     *
     * This is a compiler magic function - it doesn't behave like
     * regular D functions.
     *
     * Parameters:
     *      opcode = any of the XMM opcodes; it must be a compile time constant
     *      op1 = first operand
     *      op2 = second operand
     * Returns:
     *      result of opcode
     * Example:
    ---
    import core.simd;
    import core.stdc.stdio;

    void main()
    {
        float4 A = [2.34f, -70000.0f, 0.00001f, 345.5f];
        float4 R = A;
        R = cast(float4) __simd(XMM.RCPSS, R, A);
        printf("%g %g %g %g\n", R.array[0], R.array[1], R.array[2], R.array[3]);
    }
    ---
     * Prints `0.427368 -70000 1e-05 345.5`.
     * The use of the two operand form for `XMM.RCPSS` is necessary because the result of the instruction
     * contains elements of both operands.
     * Example:
    ---
    double[2] A = [56.0, -75.0];
    double2 R = cast(double2) __simd(XMM.LODUPD, *cast(double2*)A.ptr);
    ---
     * The cast to `double2*` is necessary because the type of `*A.ptr` is `double`.
     */
    pure @safe void16 __simd(XMM opcode, void16 op1, void16 op2);

    ///
    unittest
    {
        float4 a;
        a = cast(float4)__simd(XMM.PXOR, a, a);
    }
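
    /// Example (illustrative sketch, not from the original module): the same
    /// two operand form works for the integer opcodes, e.g. PADDD performs
    /// element-wise addition of packed 32-bit integers.
    unittest
    {
        int4 a = [1, 2, 3, 4];
        int4 b = [10, 20, 30, 40];
        int4 r = cast(int4)__simd(XMM.PADDD, a, b);
        assert(r.array[0] == 11);
        assert(r.array[3] == 44);
    }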
    /**
     * Unary SIMD instructions.
     */
    pure @safe void16 __simd(XMM opcode, void16 op1);
    pure @safe void16 __simd(XMM opcode, double d);   ///
    pure @safe void16 __simd(XMM opcode, float f);    ///

    ///
    unittest
    {
        float4 a;
        a = cast(float4)__simd(XMM.LODSS, a);
    }

    /****
     * For instructions:
     * CMPPD, CMPSS, CMPSD, CMPPS,
     * PSHUFD, PSHUFHW, PSHUFLW,
     * BLENDPD, BLENDPS, DPPD, DPPS,
     * MPSADBW, PBLENDW,
     * ROUNDPD, ROUNDPS, ROUNDSD, ROUNDSS
     * Parameters:
     *      opcode = any of the above XMM opcodes; it must be a compile time constant
     *      op1 = first operand
     *      op2 = second operand
     *      imm8 = third operand; must be a compile time constant
     * Returns:
     *      result of opcode
     */
    pure @safe void16 __simd(XMM opcode, void16 op1, void16 op2, ubyte imm8);

    ///
    unittest
    {
        float4 a;
        a = cast(float4)__simd(XMM.CMPPD, a, a, 0x7A);
    }

    /***
     * For instructions with the imm8 version:
     * PSLLD, PSLLQ, PSLLW, PSRAD, PSRAW, PSRLD, PSRLQ, PSRLW,
     * PSRLDQ, PSLLDQ
     * Parameters:
     *      opcode = any of the XMM opcodes; it must be a compile time constant
     *      op1 = first operand
     *      imm8 = second operand; must be a compile time constant
     * Returns:
     *      result of opcode
     */
    pure @safe void16 __simd_ib(XMM opcode, void16 op1, ubyte imm8);

    ///
    unittest
    {
        float4 a;
        a = cast(float4) __simd_ib(XMM.PSRLQ, a, 0x7A);
    }

    /*****
     * For "store" operations of the form:
     *      op1 op= op2
     * such as MOVLPS.
     * Returns:
     *      op2
     * These cannot be marked as pure, as semantic() doesn't check them.
     */
    @safe void16 __simd_sto(XMM opcode, void16 op1, void16 op2);
    @safe void16 __simd_sto(XMM opcode, double op1, void16 op2);  ///
    @safe void16 __simd_sto(XMM opcode, float op1, void16 op2);   ///
    @safe void16 __simd_sto(XMM opcode, void16 op1, long op2);    ///

    ///
    unittest
    {
        void16 a;
        float f = 1;
        double d = 1;

        cast(void)__simd_sto(XMM.STOUPS, a, a);
        cast(void)__simd_sto(XMM.STOUPS, f, a);
        cast(void)__simd_sto(XMM.STOUPS, d, a);
    }

    /* The following use overloading to ensure correct typing.
     * Compile with inlining on for best performance.
     */

    pure @safe short8 pcmpeq()(short8 v1, short8 v2)
    {
        return cast(short8)__simd(XMM.PCMPEQW, v1, v2);
    }

    pure @safe ushort8 pcmpeq()(ushort8 v1, ushort8 v2)
    {
        return cast(ushort8)__simd(XMM.PCMPEQW, v1, v2);
    }
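
    /// Example (illustrative sketch, not from the original module): pcmpeq
    /// sets a lane to all ones (0xFFFF, i.e. -1 as a short) where the operands
    /// compare equal, and to zero elsewhere, yielding a mask usable with
    /// PAND/POR.
    unittest
    {
        short8 a = [1, 2, 3, 4, 5, 6, 7, 8];
        short8 b = [1, 0, 3, 0, 5, 0, 7, 0];
        short8 m = pcmpeq(a, b);
        assert(m.array[0] == -1);   // equal lane: all bits set
        assert(m.array[1] == 0);    // unequal lane: zero
    }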
    /*********************
     * Emit prefetch instruction.
     * Params:
     *    address = address to be prefetched
     *    writeFetch = true for write fetch, false for read fetch
     *    locality = 0..3 (0 meaning least local, 3 meaning most local)
     * Note:
     *    The Intel mappings are:
     *    $(TABLE
     *    $(THEAD writeFetch, locality, Instruction)
     *    $(TROW false, 0, prefetchnta)
     *    $(TROW false, 1, prefetcht2)
     *    $(TROW false, 2, prefetcht1)
     *    $(TROW false, 3, prefetcht0)
     *    $(TROW true, 0, prefetchw)
     *    $(TROW true, 1, prefetchw)
     *    $(TROW true, 2, prefetchw)
     *    $(TROW true, 3, prefetchw)
     *    )
     */
    void prefetch(bool writeFetch, ubyte locality)(const(void)* address)
    {
        static if (writeFetch)
            __prefetch(address, 4);
        else static if (locality < 4)
            __prefetch(address, 3 - locality);
        else
            static assert(0, "0..3 expected for locality");
    }

    private void __prefetch(const(void*) address, ubyte encoding);
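
    /// Example (illustrative sketch, not from the original module): hinting
    /// the cache hierarchy before a read-heavy loop. Both calls are read
    /// fetches, mapping to prefetcht0 and prefetchnta respectively.
    unittest
    {
        ubyte[64] buf;
        prefetch!(false, 3)(&buf[0]);   // most local: prefetcht0
        prefetch!(false, 0)(&buf[0]);   // least local: prefetchnta
    }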
    /*************************************
     * Load unaligned vector from address.
     * This is a compiler intrinsic.
     * Params:
     *    p = pointer to vector
     * Returns:
     *    vector
     */
    V loadUnaligned(V)(const V* p)
        if (is(V == void16) ||
            is(V == byte16) ||
            is(V == ubyte16) ||
            is(V == short8) ||
            is(V == ushort8) ||
            is(V == int4) ||
            is(V == uint4) ||
            is(V == long2) ||
            is(V == ulong2) ||
            is(V == double2) ||
            is(V == float4))
    {
        pragma(inline, true);
        static if (is(V == double2))
            return cast(V)__simd(XMM.LODUPD, *cast(const void16*)p);
        else static if (is(V == float4))
            return cast(V)__simd(XMM.LODUPS, *cast(const void16*)p);
        else
            return cast(V)__simd(XMM.LODDQU, *cast(const void16*)p);
    }

    @system
    unittest
    {
        // Memory to load into the vector:
        // Should have enough data to test all 16-byte alignments, and still
        // have room for a 16-byte vector
        ubyte[32] data;
        foreach (i; 0..data.length)
        {
            data[i] = cast(ubyte)i;
        }

        // to test all alignments from 1 ~ 16
        foreach (i; 0..16)
        {
            ubyte* d = &data[i];

            void test(T)()
            {
                // load the data
                T v = loadUnaligned(cast(T*)d);

                // check that the data was loaded correctly
                ubyte* ptrToV = cast(ubyte*)&v;
                foreach (j; 0..T.sizeof)
                {
                    assert(ptrToV[j] == d[j]);
                }
            }

            test!void16();
            test!byte16();
            test!ubyte16();
            test!short8();
            test!ushort8();
            test!int4();
            test!uint4();
            test!long2();
            test!ulong2();
            test!double2();
            test!float4();
        }
    }

    /*************************************
     * Store vector to unaligned address.
     * This is a compiler intrinsic.
     * Params:
     *    p = pointer to vector
     *    value = value to store
     * Returns:
     *    value
     */
    V storeUnaligned(V)(V* p, V value)
        if (is(V == void16) ||
            is(V == byte16) ||
            is(V == ubyte16) ||
            is(V == short8) ||
            is(V == ushort8) ||
            is(V == int4) ||
            is(V == uint4) ||
            is(V == long2) ||
            is(V == ulong2) ||
            is(V == double2) ||
            is(V == float4))
    {
        pragma(inline, true);
        static if (is(V == double2))
            return cast(V)__simd_sto(XMM.STOUPD, *cast(void16*)p, value);
        else static if (is(V == float4))
            return cast(V)__simd_sto(XMM.STOUPS, *cast(void16*)p, value);
        else
            return cast(V)__simd_sto(XMM.STODQU, *cast(void16*)p, value);
    }

    @system
    unittest
    {
        // Memory to store the vector to:
        // Should have enough data to test all 16-byte alignments, and still
        // have room for a 16-byte vector
        ubyte[32] data;

        // to test all alignments from 1 ~ 16
        foreach (i; 0..16)
        {
            ubyte* d = &data[i];

            void test(T)()
            {
                T v;

                // populate `v` with data
                ubyte* ptrToV = cast(ubyte*)&v;
                foreach (j; 0..T.sizeof)
                {
                    ptrToV[j] = cast(ubyte)j;
                }

                // store `v` to location pointed to by `d`
                storeUnaligned(cast(T*)d, v);

                // check that the data was stored correctly
                foreach (j; 0..T.sizeof)
                {
                    assert(ptrToV[j] == d[j]);
                }
            }

            test!void16();
            test!byte16();
            test!ubyte16();
            test!short8();
            test!ushort8();
            test!int4();
            test!uint4();
            test!long2();
            test!ulong2();
            test!double2();
            test!float4();
        }
    }
}
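
/// Example (illustrative sketch, not from the original module): a typical
/// load / compute / store round trip over deliberately misaligned memory
/// using the intrinsics above.
version (D_SIMD)
{
    @system unittest
    {
        float[5] buf = [0.0f, 1.0f, 2.0f, 3.0f, 4.0f];
        // &buf[1] is 4 bytes past the start of buf, so it is not 16-byte aligned
        float4 v = loadUnaligned(cast(const(float4)*) &buf[1]);
        v = v + v;                              // element-wise compute
        storeUnaligned(cast(float4*) &buf[1], v);
        assert(buf[1] == 2.0f);
        assert(buf[4] == 8.0f);
    }
}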