/**
 * XMM opcodes
 *
 * Compiler implementation of the
 * $(LINK2 https://www.dlang.org, D programming language).
 *
 * Copyright:   Copyright (C) ?-2023 by The D Language Foundation, All Rights Reserved
 * Authors:     $(LINK2 https://www.digitalmars.com, Walter Bright)
 * License:     $(LINK2 https://www.boost.org/LICENSE_1_0.txt, Boost License 1.0)
 * Source:      $(LINK2 https://github.com/dlang/dmd/blob/master/src/dmd/backend/xmm.d, backend/_xmm.d)
 */

module dmd.backend.xmm;

// Online documentation: https://dlang.org/phobos/dmd_backend_xmm.html

@safe:

/* Opcode constants for SSE/SSE2/SSE3/SSSE3/SSE4/AVX/AES instructions.
 *
 * Each value packs the instruction's encoding bytes, most significant first,
 * in the same order as they are listed in the trailing comment: an optional
 * mandatory prefix (66, F2 or F3), the 0F escape byte, an optional second
 * escape byte (38 or 3A), and finally the opcode byte.
 * For example ADDSD is encoded "F2 0F 58" and its constant is 0xF20F58.
 *
 * PSLLDQ and PSRLDQ are the exception: their top byte (07 / 03) matches the
 * ModRM /digit shown in their comments (/7 and /3).
 * NOTE(review): presumably the code generator extracts that digit as the
 * ModRM reg field - confirm against the users of these constants.
 *
 * Several names deliberately share one value because the instructions share
 * an opcode and differ only in operand form:
 * MOVLHPS/LODHPS, MOVHLPS/LODLPS, PEXTRD/PEXTRQ and PINSRD/PINSRQ.
 */
enum
{
    ADDSS = 0xF30F58,           // ADDSS xmm1, xmm2/mem32 F3 0F 58 /r
    ADDSD = 0xF20F58,           // ADDSD xmm1, xmm2/mem64 F2 0F 58 /r
    ADDPS = 0x000F58,           // ADDPS xmm1, xmm2/mem128 0F 58 /r
    ADDPD = 0x660F58,           // ADDPD xmm1, xmm2/mem128 66 0F 58 /r
    PADDB = 0x660FFC,           // PADDB xmm1, xmm2/mem128 66 0F FC /r
    PADDW = 0x660FFD,           // PADDW xmm1, xmm2/mem128 66 0F FD /r
    PADDD = 0x660FFE,           // PADDD xmm1, xmm2/mem128 66 0F FE /r
    PADDQ = 0x660FD4,           // PADDQ xmm1, xmm2/mem128 66 0F D4 /r

    SUBSS = 0xF30F5C,           // SUBSS xmm1, xmm2/mem32 F3 0F 5C /r
    SUBSD = 0xF20F5C,           // SUBSD xmm1, xmm2/mem64 F2 0F 5C /r
    SUBPS = 0x000F5C,           // SUBPS xmm1, xmm2/mem128 0F 5C /r
    SUBPD = 0x660F5C,           // SUBPD xmm1, xmm2/mem128 66 0F 5C /r
    PSUBB = 0x660FF8,           // PSUBB xmm1, xmm2/mem128 66 0F F8 /r
    PSUBW = 0x660FF9,           // PSUBW xmm1, xmm2/mem128 66 0F F9 /r
    PSUBD = 0x660FFA,           // PSUBD xmm1, xmm2/mem128 66 0F FA /r
    PSUBQ = 0x660FFB,           // PSUBQ xmm1, xmm2/mem128 66 0F FB /r

    MULSS = 0xF30F59,           // MULSS  xmm1, xmm2/mem32 F3 0F 59 /r
    MULSD = 0xF20F59,           // MULSD  xmm1, xmm2/mem64 F2 0F 59 /r
    MULPS = 0x000F59,           // MULPS  xmm1, xmm2/mem128 0F 59 /r
    MULPD = 0x660F59,           // MULPD  xmm1, xmm2/mem128 66 0F 59 /r
    PMULLW = 0x660FD5,          // PMULLW xmm1, xmm2/mem128 66 0F D5 /r

    DIVSS = 0xF30F5E,           // DIVSS xmm1, xmm2/mem32 F3 0F 5E /r
    DIVSD = 0xF20F5E,           // DIVSD xmm1, xmm2/mem64 F2 0F 5E /r
    DIVPS = 0x000F5E,           // DIVPS xmm1, xmm2/mem128 0F 5E /r
    DIVPD = 0x660F5E,           // DIVPD xmm1, xmm2/mem128 66 0F 5E /r

    PAND  = 0x660FDB,           // PAND xmm1, xmm2/mem128 66 0F DB /r
    POR   = 0x660FEB,           // POR  xmm1, xmm2/mem128 66 0F EB /r

    UCOMISS = 0x000F2E,         // UCOMISS xmm1, xmm2/mem32 0F 2E /r
    UCOMISD = 0x660F2E,         // UCOMISD xmm1, xmm2/mem64 66 0F 2E /r

    XORPS = 0x000F57,           // XORPS xmm1, xmm2/mem128 0F 57 /r
    XORPD = 0x660F57,           // XORPD xmm1, xmm2/mem128 66 0F 57 /r

    // Use STO and LOD instead of MOV to distinguish the direction
    STOSS  = 0xF30F11,          // MOVSS  xmm1/mem32, xmm2 F3 0F 11 /r
    STOSD  = 0xF20F11,          // MOVSD  xmm1/mem64, xmm2 F2 0F 11 /r
    STOAPS = 0x000F29,          // MOVAPS xmm1/mem128, xmm2 0F 29 /r
    STOAPD = 0x660F29,          // MOVAPD xmm1/mem128, xmm2   66 0F 29 /r
    STODQA = 0x660F7F,          // MOVDQA xmm1/mem128, xmm2 66 0F 7F /r
    STOD   = 0x660F7E,          // MOVD   reg/mem64, xmm   66 0F 7E /r
    STOQ   = 0x660FD6,          // MOVQ   xmm1/mem64, xmm2 66 0F D6 /r

    LODSS  = 0xF30F10,          // MOVSS  xmm1, xmm2/mem32 F3 0F 10 /r
    LODSD  = 0xF20F10,          // MOVSD  xmm1, xmm2/mem64 F2 0F 10 /r
    LODAPS = 0x000F28,          // MOVAPS xmm1, xmm2/mem128 0F 28 /r
    LODAPD = 0x660F28,          // MOVAPD xmm1, xmm2/mem128   66 0F 28 /r
    LODDQA = 0x660F6F,          // MOVDQA xmm1, xmm2/mem128 66 0F 6F /r
    LODD   = 0x660F6E,          // MOVD   xmm, reg/mem64   66 0F 6E /r
    LODQ   = 0xF30F7E,          // MOVQ   xmm1, xmm2/mem64 F3 0F 7E /r

    LODDQU   = 0xF30F6F,        // MOVDQU xmm1, xmm2/mem128  F3 0F 6F /r
    STODQU   = 0xF30F7F,        // MOVDQU xmm1/mem128, xmm2  F3 0F 7F /r
    MOVDQ2Q  = 0xF20FD6,        // MOVDQ2Q mmx, xmm          F2 0F D6 /r
    LODHPD   = 0x660F16,        // MOVHPD xmm, mem64         66 0F 16 /r
    STOHPD   = 0x660F17,        // MOVHPD mem64, xmm         66 0F 17 /r
    LODHPS   = 0x0F16,          // MOVHPS xmm, mem64         0F 16 /r
    STOHPS   = 0x0F17,          // MOVHPS mem64, xmm         0F 17 /r
    MOVLHPS  = 0x0F16,          // MOVLHPS xmm1, xmm2        0F 16 /r (same opcode as LODHPS, register form)
    LODLPD   = 0x660F12,        // MOVLPD xmm, mem64         66 0F 12 /r
    STOLPD   = 0x660F13,        // MOVLPD mem64, xmm         66 0F 13 /r
    MOVHLPS  = 0x0F12,          // MOVHLPS xmm1, xmm2        0F 12 /r (same opcode as LODLPS, register form)
    LODLPS   = 0x0F12,          // MOVLPS xmm, mem64         0F 12 /r
    STOLPS   = 0x0F13,          // MOVLPS mem64, xmm         0F 13 /r
    MOVMSKPD = 0x660F50,        // MOVMSKPD reg32, xmm 66 0F 50 /r
    MOVMSKPS = 0x0F50,          // MOVMSKPS reg32, xmm 0F 50 /r
    MOVNTDQ  = 0x660FE7,        // MOVNTDQ mem128, xmm 66 0F E7 /r
    MOVNTI   = 0x0FC3,          // MOVNTI m32,r32 0F C3 /r
                                // MOVNTI m64,r64 0F C3 /r
    MOVNTPD  = 0x660F2B,        // MOVNTPD mem128, xmm 66 0F 2B /r
    MOVNTPS  = 0x0F2B,          // MOVNTPS mem128, xmm 0F 2B /r
    MOVNTQ   = 0x0FE7,          // MOVNTQ m64, mmx 0F E7 /r
    MOVQ2DQ  = 0xF30FD6,        // MOVQ2DQ xmm, mmx F3 0F D6 /r
    LODUPD   = 0x660F10,        // MOVUPD xmm1, xmm2/mem128 66 0F 10 /r
    STOUPD   = 0x660F11,        // MOVUPD xmm1/mem128, xmm2 66 0F 11 /r
    LODUPS   = 0x0F10,          // MOVUPS xmm1, xmm2/mem128 0F 10 /r
    STOUPS   = 0x0F11,          // MOVUPS xmm1/mem128, xmm2 0F 11 /r

    PACKSSDW = 0x660F6B,        // PACKSSDW xmm1, xmm2/mem128 66 0F 6B /r
    PACKSSWB = 0x660F63,        // PACKSSWB xmm1, xmm2/mem128 66 0F 63 /r
    PACKUSWB = 0x660F67,        // PACKUSWB xmm1, xmm2/mem128 66 0F 67 /r
    PADDSB = 0x660FEC,          // PADDSB xmm1, xmm2/mem128 66 0F EC /r
    PADDSW = 0x660FED,          // PADDSW xmm1, xmm2/mem128 66 0F ED /r
    PADDUSB = 0x660FDC,         // PADDUSB xmm1, xmm2/mem128 66 0F DC /r
    PADDUSW = 0x660FDD,         // PADDUSW xmm1, xmm2/mem128 66 0F DD /r
    PANDN = 0x660FDF,           // PANDN xmm1, xmm2/mem128 66 0F DF /r
    PCMPEQB = 0x660F74,         // PCMPEQB xmm1, xmm2/mem128 66 0F 74 /r
    PCMPEQD = 0x660F76,         // PCMPEQD xmm1, xmm2/mem128 66 0F 76 /r
    PCMPEQW = 0x660F75,         // PCMPEQW xmm1, xmm2/mem128 66 0F 75 /r
    PCMPGTB = 0x660F64,         // PCMPGTB xmm1, xmm2/mem128 66 0F 64 /r
    PCMPGTD = 0x660F66,         // PCMPGTD xmm1, xmm2/mem128 66 0F 66 /r
    PCMPGTW = 0x660F65,         // PCMPGTW xmm1, xmm2/mem128 66 0F 65 /r
    PMADDWD = 0x660FF5,         // PMADDWD xmm1, xmm2/mem128 66 0F F5 /r
    PSLLW = 0x660FF1,           // PSLLW xmm1, xmm2/mem128    66 0F F1 /r
                                // PSLLW xmm, imm8            66 0F 71 /6 ib
    PSLLD = 0x660FF2,           // PSLLD xmm1, xmm2/mem128    66 0F F2 /r
                                // PSLLD xmm, imm8            66 0F 72 /6 ib
    PSLLQ = 0x660FF3,           // PSLLQ xmm1, xmm2/mem128    66 0F F3 /r
                                // PSLLQ xmm, imm8            66 0F 73 /6 ib
    PSRAW = 0x660FE1,           // PSRAW xmm1, xmm2/mem128    66 0F E1 /r
                                // PSRAW xmm, imm8            66 0F 71 /4 ib
    PSRAD = 0x660FE2,           // PSRAD xmm1, xmm2/mem128    66 0F E2 /r
                                // PSRAD xmm, imm8            66 0F 72 /4 ib
    PSRLW = 0x660FD1,           // PSRLW xmm1, xmm2/mem128    66 0F D1 /r
                                // PSRLW xmm, imm8            66 0F 71 /2 ib
    PSRLD = 0x660FD2,           // PSRLD xmm1, xmm2/mem128    66 0F D2 /r
                                // PSRLD xmm, imm8            66 0F 72 /2 ib
    PSRLQ = 0x660FD3,           // PSRLQ xmm1, xmm2/mem128    66 0F D3 /r
                                // PSRLQ xmm, imm8            66 0F 73 /2 ib
    PSUBSB = 0x660FE8,          // PSUBSB xmm1, xmm2/mem128 66 0F E8 /r
    PSUBSW = 0x660FE9,          // PSUBSW xmm1, xmm2/mem128 66 0F E9 /r
    PSUBUSB = 0x660FD8,         // PSUBUSB xmm1, xmm2/mem128 66 0F D8 /r
    PSUBUSW = 0x660FD9,         // PSUBUSW xmm1, xmm2/mem128 66 0F D9 /r
    PUNPCKHBW = 0x660F68,       // PUNPCKHBW xmm1, xmm2/mem128 66 0F 68 /r
    PUNPCKHDQ = 0x660F6A,       // PUNPCKHDQ xmm1, xmm2/mem128 66 0F 6A /r
    PUNPCKHWD = 0x660F69,       // PUNPCKHWD xmm1, xmm2/mem128 66 0F 69 /r
    PUNPCKLBW = 0x660F60,       // PUNPCKLBW xmm1, xmm2/mem128 66 0F 60 /r
    PUNPCKLDQ = 0x660F62,       // PUNPCKLDQ xmm1, xmm2/mem128 66 0F 62 /r
    PUNPCKLWD = 0x660F61,       // PUNPCKLWD xmm1, xmm2/mem128 66 0F 61 /r
    PXOR = 0x660FEF,            // PXOR xmm1, xmm2/mem128 66 0F EF /r
    ANDPD = 0x660F54,           // ANDPD xmm1, xmm2/mem128 66 0F 54 /r
    ANDPS = 0x0F54,             // ANDPS xmm1, xmm2/mem128 0F 54 /r
    ANDNPD = 0x660F55,          // ANDNPD xmm1, xmm2/mem128 66 0F 55 /r
    ANDNPS = 0x0F55,            // ANDNPS xmm1, xmm2/mem128 0F 55 /r
    CMPPS = 0x0FC2,             // CMPPS xmm1, xmm2/mem128, imm8 0F C2 /r ib
    CMPPD = 0x660FC2,           // CMPPD xmm1, xmm2/mem128, imm8 66 0F C2 /r ib
    CMPSD = 0xF20FC2,           // CMPSD xmm1, xmm2/mem64, imm8 F2 0F C2 /r ib
    CMPSS = 0xF30FC2,           // CMPSS xmm1, xmm2/mem32, imm8 F3 0F C2 /r ib
    COMISD = 0x660F2F,          // COMISD xmm1, xmm2/mem64 66 0F 2F /r
    COMISS = 0x0F2F,            // COMISS xmm1, xmm2/mem32 0F 2F /r
    CVTDQ2PD = 0xF30FE6,        // CVTDQ2PD   xmm1, xmm2/mem64  F3 0F E6 /r
    CVTDQ2PS = 0x0F5B,          // CVTDQ2PS   xmm1, xmm2/mem128 0F 5B /r
    CVTPD2DQ = 0xF20FE6,        // CVTPD2DQ   xmm1, xmm2/mem128 F2 0F E6 /r
    CVTPD2PI = 0x660F2D,        // CVTPD2PI   mmx, xmm2/mem128  66 0F 2D /r
    CVTPD2PS = 0x660F5A,        // CVTPD2PS   xmm1, xmm2/mem128 66 0F 5A /r
    CVTPI2PD = 0x660F2A,        // CVTPI2PD   xmm, mmx/mem64    66 0F 2A /r
    CVTPI2PS = 0x0F2A,          // CVTPI2PS   xmm, mmx/mem64    0F 2A /r
    CVTPS2DQ = 0x660F5B,        // CVTPS2DQ   xmm1, xmm2/mem128 66 0F 5B /r
    CVTPS2PD = 0x0F5A,          // CVTPS2PD   xmm1, xmm2/mem64  0F 5A /r
    CVTPS2PI = 0x0F2D,          // CVTPS2PI   mmx, xmm/mem64    0F 2D /r
    CVTSD2SI = 0xF20F2D,        // CVTSD2SI   reg32, xmm/mem64  F2 0F 2D /r
                                // CVTSD2SI   reg64, xmm/mem64  F2 0F 2D /r
    CVTSD2SS = 0xF20F5A,        // CVTSD2SS   xmm1, xmm2/mem64  F2 0F 5A /r
    CVTSI2SD = 0xF20F2A,        // CVTSI2SD   xmm, reg/mem32    F2 0F 2A /r
                                // CVTSI2SD   xmm, reg/mem64    F2 0F 2A /r
    CVTSI2SS = 0xF30F2A,        // CVTSI2SS   xmm, reg/mem32    F3 0F 2A /r
                                // CVTSI2SS   xmm, reg/mem64    F3 0F 2A /r
    CVTSS2SD = 0xF30F5A,        // CVTSS2SD   xmm1, xmm2/mem32  F3 0F 5A /r
    CVTSS2SI = 0xF30F2D,        // CVTSS2SI   reg32, xmm2/mem32 F3 0F 2D /r
                                // CVTSS2SI   reg64, xmm2/mem32 F3 0F 2D /r
    CVTTPD2PI = 0x660F2C,       // CVTTPD2PI  mmx, xmm/mem128   66 0F 2C /r
    CVTTPD2DQ = 0x660FE6,       // CVTTPD2DQ  xmm1, xmm2/mem128 66 0F E6 /r
    CVTTPS2DQ = 0xF30F5B,       // CVTTPS2DQ  xmm1, xmm2/mem128 F3 0F 5B /r
    CVTTPS2PI = 0x0F2C,         // CVTTPS2PI  mmx, xmm/mem64    0F 2C /r
    CVTTSD2SI = 0xF20F2C,       // CVTTSD2SI  reg32, xmm/mem64  F2 0F 2C /r
                                // CVTTSD2SI  reg64, xmm/mem64  F2 0F 2C /r
    CVTTSS2SI = 0xF30F2C,       // CVTTSS2SI  reg32, xmm/mem32  F3 0F 2C /r
                                // CVTTSS2SI  reg64, xmm/mem32  F3 0F 2C /r
    MASKMOVDQU = 0x660FF7,      // MASKMOVDQU xmm1, xmm2        66 0F F7 /r
    MASKMOVQ = 0x0FF7,          // MASKMOVQ   mm1,mm2           0F F7 /r
    MAXPD = 0x660F5F,           // MAXPD      xmm1, xmm2/mem128 66 0F 5F /r
    MAXPS = 0x0F5F,             // MAXPS      xmm1, xmm2/mem128 0F 5F /r
    MAXSD = 0xF20F5F,           // MAXSD      xmm1, xmm2/mem64  F2 0F 5F /r
    MAXSS = 0xF30F5F,           // MAXSS xmm1, xmm2/mem32 F3 0F 5F /r
    MINPD = 0x660F5D,           // MINPD xmm1, xmm2/mem128 66 0F 5D /r
    MINPS = 0x0F5D,             // MINPS xmm1, xmm2/mem128 0F 5D /r
    MINSD = 0xF20F5D,           // MINSD xmm1, xmm2/mem64 F2 0F 5D /r
    MINSS = 0xF30F5D,           // MINSS xmm1, xmm2/mem32   F3 0F 5D /r
    ORPD = 0x660F56,            // ORPD xmm1, xmm2/mem128 66 0F 56 /r
    ORPS = 0x0F56,              // ORPS xmm1, xmm2/mem128 0F 56 /r
    PAVGB = 0x660FE0,           // PAVGB xmm1, xmm2/mem128 66 0F E0 /r
    PAVGW = 0x660FE3,           // PAVGW xmm1, xmm2/mem128 66 0F E3 /r
    PMAXSW = 0x660FEE,          // PMAXSW xmm1, xmm2/mem128 66 0F EE /r
    PINSRW = 0x660FC4,          // PINSRW xmm, reg32/mem16, imm8   66 0F C4 /r ib
    PMAXUB = 0x660FDE,          // PMAXUB xmm1, xmm2/mem128 66 0F DE /r
    PMINSW = 0x660FEA,          // PMINSW xmm1, xmm2/mem128 66 0F EA /r
    PMINUB = 0x660FDA,          // PMINUB xmm1, xmm2/mem128 66 0F DA /r
    PMOVMSKB = 0x660FD7,        // PMOVMSKB reg32, xmm   66 0F D7 /r
    PMULHUW = 0x660FE4,         // PMULHUW xmm1, xmm2/mem128 66 0F E4 /r
    PMULHW = 0x660FE5,          // PMULHW xmm1, xmm2/mem128 66 0F E5 /r
    PMULUDQ = 0x660FF4,         // PMULUDQ xmm1, xmm2/mem128 66 0F F4 /r
    PSADBW = 0x660FF6,          // PSADBW xmm1, xmm2/mem128 66 0F F6 /r
    PUNPCKHQDQ = 0x660F6D,      // PUNPCKHQDQ xmm1, xmm2/mem128 66 0F 6D /r
    PUNPCKLQDQ = 0x660F6C,      // PUNPCKLQDQ xmm1, xmm2/mem128 66 0F 6C /r
    RCPPS = 0x0F53,             // RCPPS xmm1, xmm2/mem128 0F 53 /r
    RCPSS = 0xF30F53,           // RCPSS xmm1, xmm2/mem32 F3 0F 53 /r
    RSQRTPS = 0x0F52,           // RSQRTPS xmm1, xmm2/mem128 0F 52 /r
    RSQRTSS = 0xF30F52,         // RSQRTSS xmm1, xmm2/mem32 F3 0F 52 /r
    SQRTPD = 0x660F51,          // SQRTPD xmm1, xmm2/mem128 66 0F 51 /r
    SHUFPD = 0x660FC6,          // SHUFPD xmm1, xmm2/mem128, imm8 66 0F C6 /r ib
    SHUFPS = 0x0FC6,            // SHUFPS xmm1, xmm2/mem128, imm8 0F C6 /r ib
    SQRTPS = 0x0F51,            // SQRTPS xmm1, xmm2/mem128 0F 51 /r
    SQRTSD = 0xF20F51,          // SQRTSD xmm1, xmm2/mem64 F2 0F 51 /r
    SQRTSS = 0xF30F51,          // SQRTSS xmm1, xmm2/mem32 F3 0F 51 /r
    UNPCKHPD = 0x660F15,        // UNPCKHPD xmm1, xmm2/mem128   66 0F 15 /r
    UNPCKHPS = 0x0F15,          // UNPCKHPS xmm1, xmm2/mem128   0F 15 /r
    UNPCKLPD = 0x660F14,        // UNPCKLPD xmm1, xmm2/mem128   66 0F 14 /r
    UNPCKLPS = 0x0F14,          // UNPCKLPS xmm1, xmm2/mem128   0F 14 /r

    PSHUFD = 0x660F70,          // PSHUFD  xmm1, xmm2/mem128, imm8 66 0F 70 /r ib
    PSHUFHW = 0xF30F70,         // PSHUFHW xmm1, xmm2/mem128, imm8 F3 0F 70 /r ib
    PSHUFLW = 0xF20F70,         // PSHUFLW xmm1, xmm2/mem128, imm8  F2 0F 70 /r ib
    PSHUFW = 0x0F70,            // PSHUFW  mm1, mm2/mem64, imm8  0F 70 /r ib
    // The top byte of the next two values matches the ModRM /digit (/7, /3)
    PSLLDQ = 0x07660F73,        // PSLLDQ  xmm, imm8   66 0F 73 /7 ib
    PSRLDQ = 0x03660F73,        // PSRLDQ  xmm, imm8   66 0F 73 /3 ib

    PREFETCH = 0x0F18,

    PEXTRW = 0x660FC5,          // PEXTRW  reg32, xmm, imm8 66 0F C5 /r ib
    STMXCSR = 0x0FAE,           // STMXCSR mem32 0F AE /3

// SSE3 Pentium 4 (Prescott)

    ADDSUBPD = 0x660FD0,        // ADDSUBPD xmm1, xmm2/m128
    ADDSUBPS = 0xF20FD0,
    HADDPD   = 0x660F7C,
    HADDPS   = 0xF20F7C,
    HSUBPD   = 0x660F7D,
    HSUBPS   = 0xF20F7D,
    MOVDDUP  = 0xF20F12,
    MOVSHDUP = 0xF30F16,
    MOVSLDUP = 0xF30F12,
    LDDQU    = 0xF20FF0,
    MONITOR  = 0x0F01C8,
    MWAIT    = 0x0F01C9,

// SSSE3
    PALIGNR = 0x660F3A0F,
    PHADDD = 0x660F3802,
    PHADDW = 0x660F3801,
    PHADDSW = 0x660F3803,
    PABSB = 0x660F381C,
    PABSD = 0x660F381E,
    PABSW = 0x660F381D,
    PSIGNB = 0x660F3808,
    PSIGND = 0x660F380A,
    PSIGNW = 0x660F3809,
    PSHUFB = 0x660F3800,
    PMADDUBSW = 0x660F3804,
    PMULHRSW = 0x660F380B,
    PHSUBD = 0x660F3806,
    PHSUBW = 0x660F3805,
    PHSUBSW = 0x660F3807,

// SSE4.1
// See Intel SSE4 Programming Reference

    BLENDPD   = 0x660F3A0D,     // 66 0F 3A 0D /r ib  BLENDPD  xmm1, xmm2/m128, imm8
    BLENDPS   = 0x660F3A0C,     // 66 0F 3A 0C /r ib  BLENDPS  xmm1, xmm2/m128, imm8
    BLENDVPD  = 0x660F3815,     // 66 0F 38 15 /r     BLENDVPD xmm1, xmm2/m128, <XMM0>
    BLENDVPS  = 0x660F3814,     // 66 0F 38 14 /r     BLENDVPS xmm1, xmm2/m128, <XMM0>
    DPPD      = 0x660F3A41,
    DPPS      = 0x660F3A40,
    EXTRACTPS = 0x660F3A17,
    INSERTPS  = 0x660F3A21,
    MPSADBW   = 0x660F3A42,
    PBLENDVB  = 0x660F3810,
    PBLENDW   = 0x660F3A0E,
    PEXTRD    = 0x660F3A16,
    PEXTRQ    = 0x660F3A16,     // same opcode bytes as PEXTRD
    PINSRB    = 0x660F3A20,     // 66 0F 3A 20 /r ib PINSRB xmm1, r32/m8, imm8
    PINSRD    = 0x660F3A22,
    PINSRQ    = 0x660F3A22,     // same opcode bytes as PINSRD

    MOVNTDQA = 0x660F382A,
    PACKUSDW = 0x660F382B,
    PCMPEQQ = 0x660F3829,
    PEXTRB = 0x660F3A14,        // 66 0F 3A 14 /r ib       PEXTRB r32/m8, xmm2, imm8
                                // 66 REX.W 0F 3A 14 /r ib PEXTRB r64/m8, xmm2, imm8
    PHMINPOSUW = 0x660F3841,    // 66 0F 38 41 /r          PHMINPOSUW xmm1, xmm2/m128
    PMAXSB = 0x660F383C,
    PMAXSD = 0x660F383D,
    PMAXUD = 0x660F383F,
    PMAXUW = 0x660F383E,
    PMINSB = 0x660F3838,
    PMINSD = 0x660F3839,
    PMINUD = 0x660F383B,
    PMINUW = 0x660F383A,
    PMOVSXBW = 0x660F3820,
    PMOVSXBD = 0x660F3821,
    PMOVSXBQ = 0x660F3822,
    PMOVSXWD = 0x660F3823,
    PMOVSXWQ = 0x660F3824,
    PMOVSXDQ = 0x660F3825,
    PMOVZXBW = 0x660F3830,
    PMOVZXBD = 0x660F3831,
    PMOVZXBQ = 0x660F3832,
    PMOVZXWD = 0x660F3833,
    PMOVZXWQ = 0x660F3834,
    PMOVZXDQ = 0x660F3835,
    PMULDQ   = 0x660F3828,
    PMULLD   = 0x660F3840,
    PTEST    = 0x660F3817,      // 66 0F 38 17 /r PTEST xmm1, xmm2/m128

    ROUNDPD = 0x660F3A09,       // 66 0F 3A 09 /r ib ROUNDPD xmm1, xmm2/m128, imm8
    ROUNDPS = 0x660F3A08,
    ROUNDSD = 0x660F3A0B,
    ROUNDSS = 0x660F3A0A,

// SSE4.2
    PCMPESTRI  = 0x660F3A61,
    PCMPESTRM  = 0x660F3A60,
    PCMPISTRI  = 0x660F3A63,
    PCMPISTRM  = 0x660F3A62,
    PCMPGTQ    = 0x660F3837,
    // CRC32

// SSE4a (AMD only)
    // EXTRQ,INSERTQ,MOVNTSD,MOVNTSS

// POPCNT and LZCNT (have their own CPUID bits)
    POPCNT     = 0xF30FB8,
    // LZCNT

// AVX
    XGETBV = 0x0F01D0,
    XSETBV = 0x0F01D1,
    VBROADCASTSS   = 0x660F3818,
    VBROADCASTSD   = 0x660F3819,
    VBROADCASTF128 = 0x660F381A,
    VINSERTF128    = 0x660F3A18,

// AVX2
    VPBROADCASTB   = 0x660F3878,
    VPBROADCASTW   = 0x660F3879,
    VPBROADCASTD   = 0x660F3858,
    VPBROADCASTQ   = 0x660F3859,
    VBROADCASTI128 = 0x660F385A,
    VINSERTI128    = 0x660F3A38,

// AES
    AESENC     = 0x660F38DC,
    AESENCLAST = 0x660F38DD,
    AESDEC     = 0x660F38DE,
    AESDECLAST = 0x660F38DF,
    AESIMC     = 0x660F38DB,
    AESKEYGENASSIST = 0x660F3ADF,
}