1 /**
2  * Instruction scheduler
3  *
4  * Compiler implementation of the
5  * $(LINK2 https://www.dlang.org, D programming language).
6  *
7  * Copyright:   Copyright (C) 1995-1998 by Symantec
8  *              Copyright (C) 2000-2023 by The D Language Foundation, All Rights Reserved
9  * Authors:     $(LINK2 https://www.digitalmars.com, Walter Bright)
10  * License:     $(LINK2 https://www.boost.org/LICENSE_1_0.txt, Boost License 1.0)
11  * Source:      $(LINK2 https://github.com/dlang/dmd/blob/master/src/dmd/backend/cgsched.c, backend/cgsched.d)
12  */
13 
14 module dmd.backend.cgsched;
15 
16 import core.stdc.stdio;
17 import core.stdc.stdlib;
18 import core.stdc.string;
19 
20 import dmd.backend.cc;
21 import dmd.backend.cdef;
22 import dmd.backend.cgen : gen1, gen2;
23 import dmd.backend.code;
24 import dmd.backend.code_x86;
25 import dmd.backend.dlist;
26 import dmd.backend.global;
27 import dmd.backend.mem;
28 import dmd.backend.ty;
29 import dmd.backend.barray;
30 
31 
32 nothrow:
33 @safe:
34 
/**
 * Combine a default addressing choice with an instruction's address size flag.
 *
 * The result is always true when I64 is true.  Otherwise it is `x`
 * exclusive-or'ed with whether the CFaddrsize bit is set in `Iflags`,
 * i.e. a set CFaddrsize bit inverts `x`.
 *
 * Note: even for linux targets, CFaddrsize can be set by the inline
 * assembler.
 *
 * Params:
 *      x = the choice to start from (presumably callers pass I32 - confirm)
 *      Iflags = instruction flags; only the CFaddrsize bit is examined
 * Returns:
 *      true if I64, else `x` flipped by the CFaddrsize bit
 */
private bool is32bitaddr(bool x, uint Iflags)
{
    if (I64)
        return true;

    const bool addrSizeFlag = (Iflags & CFaddrsize) != 0;
    return x != addrSizeFlag;           // bool != bool is exclusive-or
}
42 
/// Returns: true if config.target_cpu is TARGET_PentiumPro or later,
/// i.e. the Pentium Pro scheduler is to be used
@trusted
private bool PRO()
{
    return config.target_cpu >= TARGET_PentiumPro;
}
46 
/// Values stored in Cinfo.fp_op
private enum FP : ubyte
{
    none,           /// 0: none
    fstp,           /// 1: FSTP mem
    fld,            /// 2: FLD mem
    fop,            /// 3: Fop ST0,mem or Fop ST0
}
54 
/// Bit flags stored in Cinfo.flags
private enum CIFL : ubyte
{
    arraybounds = 1 << 0,   /// this instruction is a jmp to array bounds
    ea          = 1 << 1,   /// this instruction has a memory-referencing
                            /// modregrm EA byte
    nostage     = 1 << 2,   /// don't stage these instructions
    push        = 1 << 3,   /// it's a push we can swap around
}
63 
/// Information gathered about a single instruction
struct Cinfo
{
    code *c;            /// the instruction
    ubyte pair;         /// pairing information
    ubyte sz;           /// operand size
    ubyte isz;          /// instruction size

    // Floating point scheduling
    ubyte fxch_pre;
    ubyte fxch_post;
    FP fp_op;           /// one of FP.xxx

    ubyte flags;        /// set of CIFL.xxx bits

    uint r;             /// read mask
    uint w;             /// write mask
    uint a;             /// registers used in addressing mode
    ubyte reg;          /// reg field of modregrm byte
    ubyte uops;         /// Pentium Pro micro-ops
    uint sibmodrm;      /// (sib << 8) + mod__rm byte
    uint spadjust;      /// if !=0, amount ESP changes when this
                        /// instruction is executed
    int fpuadjust;      /// if !=0, amount the FPU stack changes when
                        /// this instruction is executed

    /**
     * Pretty-printer: write the fields of this Cinfo with printf.
     * Prints "Cinfo 0" when invoked through a null pointer.
     * The floating point line is only printed when fp_op is not FP.none;
     * fpuadjust is not printed.
     */
    @trusted
    nothrow void print()
    {
        Cinfo* self = &this;

        if (self is null)
        {
            printf("Cinfo 0\n");
            return;
        }

        printf("Cinfo %p:  c %p, pair %x, sz %d, isz %d, flags - ",
               self, self.c, self.pair, self.sz, self.isz);

        // Name each recognized flag bit; anything left over is reported as bad
        enum uint knownFlags = CIFL.arraybounds | CIFL.ea | CIFL.nostage | CIFL.push;
        const uint f = self.flags;

        if (f & CIFL.arraybounds)
            printf("arraybounds,");
        if (f & CIFL.ea)
            printf("ea,");
        if (f & CIFL.nostage)
            printf("nostage,");
        if (f & CIFL.push)
            printf("push,");
        if (f & ~knownFlags)
            printf("bad flag,");

        printf("\n\tr %x w %x a %x reg %x uops %x sibmodrm %x spadjust %d\n",
                cast(int)self.r, cast(int)self.w, cast(int)self.a,
                self.reg, self.uops, self.sibmodrm, cast(int)self.spadjust);

        if (self.fp_op != FP.none)
        {
            // Names for FP.fstp .. FP.fop, hence the -1 when indexing
            __gshared const(char*)[3] fpops = ["fstp","fld","fop"];

            printf("\tfp_op %s, fxch_pre %x, fxch_post %x\n",
                    fpops[self.fp_op - 1], self.fxch_pre, self.fxch_post);
        }
    }

}
125 
126 
/*****************************************
 * Do Pentium optimizations on a code list.
 *
 * Nothing is done unless config.target_scheduler is at least TARGET_80486.
 * Otherwise the list is run through peephole() (skipped when I64), and,
 * when I32, through simpleops() (only for TARGET_Pentium and
 * TARGET_PentiumMMX cpus) followed by schedule().
 *
 * Params:
 *      pc = pointer to the head of the code list; updated in place
 *      scratch = scratch registers we can use
 */

@trusted
private void cgsched_pentium(code **pc,regm_t scratch)
{
    //printf("scratch = x%02x\n",scratch);
    if (config.target_scheduler < TARGET_80486)
        return;

    if (!I64)
        *pc = peephole(*pc,0);

    if (!I32)                           // forget about 16 bit code
        return;

    const cpu = config.target_cpu;
    if (cpu == TARGET_Pentium || cpu == TARGET_PentiumMMX)
        *pc = simpleops(*pc,scratch);

    *pc = schedule(*pc,0);
}
150 
/************************************
 * Entry point: run the scheduler over the code of one block.
 *
 * The block is left untouched unless CFG4speed is set in config.flags4,
 * config.target_cpu is at least TARGET_Pentium, and the block is not a
 * BCasm block.
 *
 * Params:
 *      b = block whose Bcode is rewritten in place
 */
@trusted
public void cgsched_block(block* b)
{
    if (!(config.flags4 & CFG4speed))
        return;
    if (config.target_cpu < TARGET_Pentium)
        return;
    if (b.BC == BCasm)
        return;

    // Scratch registers are allregs minus everything the block's register
    // context marks as used, params, immed or cse, and minus mfuncreg
    regm_t scratch = allregs;
    scratch &= ~(b.Bregcon.used | b.Bregcon.params | mfuncreg |
                 b.Bregcon.immed.mval | b.Bregcon.cse.mval);

    cgsched_pentium(&b.Bcode,scratch);
    //printf("after schedule:\n"); WRcodlst(b.Bcode);
}
169 
/// Pairing attribute bits, as used by the entries of pentcycl[]
enum
{
    NP    = 0,          /// not pairable
    PU    = 1 << 0,     /// pairable in U only, never executed in V
    PV    = 1 << 1,     /// pairable in V only
    UV    = PU | PV,    /// pairable in both U and V
    PE    = 1 << 2,     /// register contention exception
    PF    = 1 << 3,     /// flags contention exception
    FX    = 1 << 4,     /// pairable with FXCH instruction
}
180 
/**
 * Pairing attributes, one entry per opcode byte 0x00..0xFF.
 * Each entry is a combination of the NP/PU/PV/UV/PE/PF/FX bits.
 * The trailing comment on each row of 8 is the (hex) opcode of the
 * row's first entry.
 */
extern (D) private immutable ubyte[256] pentcycl =
[
        UV,UV,UV,UV,    UV,UV,NP,NP,    // 0
        UV,UV,UV,UV,    UV,UV,NP,NP,    // 8
        PU,PU,PU,PU,    PU,PU,NP,NP,    // 10
        PU,PU,PU,PU,    PU,PU,NP,NP,    // 18
        UV,UV,UV,UV,    UV,UV,NP,NP,    // 20
        UV,UV,UV,UV,    UV,UV,NP,NP,    // 28
        UV,UV,UV,UV,    UV,UV,NP,NP,    // 30
        UV,UV,UV,UV,    UV,UV,NP,NP,    // 38

        UV,UV,UV,UV,    UV,UV,UV,UV,    // 40
        UV,UV,UV,UV,    UV,UV,UV,UV,    // 48
        PE|UV,PE|UV,PE|UV,PE|UV,        PE|UV,PE|UV,PE|UV,PE|UV, // 50  PUSH reg
        PE|UV,PE|UV,PE|UV,PE|UV,        PE|UV,PE|UV,PE|UV,PE|UV, // 58  POP reg
        NP,NP,NP,NP,    NP,NP,NP,NP,    // 60
        PE|UV,NP,PE|UV,NP,      NP,NP,NP,NP,    // 68
        PV|PF,PV|PF,PV|PF,PV|PF,        PV|PF,PV|PF,PV|PF,PV|PF,        // 70   Jcc rel8
        PV|PF,PV|PF,PV|PF,PV|PF,        PV|PF,PV|PF,PV|PF,PV|PF,        // 78   Jcc rel8

        NP,NP,NP,NP,    NP,NP,NP,NP,    // 80
        UV,UV,UV,UV,    NP,UV,NP,NP,    // 88
        NP,NP,NP,NP,    NP,NP,NP,NP,    // 90
        NP,NP,NP,NP,    NP,NP,NP,NP,    // 98
        UV,UV,UV,UV,    NP,NP,NP,NP,    // A0
        UV,UV,NP,NP,    NP,NP,NP,NP,    // A8
        UV,UV,UV,UV,    UV,UV,UV,UV,    // B0
        UV,UV,UV,UV,    UV,UV,UV,UV,    // B8

        NP,NP,NP,NP,    NP,NP,NP,NP,    // C0
        NP,NP,NP,NP,    NP,NP,NP,NP,    // C8
        PU,PU,NP,NP,    NP,NP,NP,NP,    // D0
        FX,NP,FX,FX,    NP,NP,FX,NP,    // D8   all floating point
        NP,NP,NP,NP,    NP,NP,NP,NP,    // E0
        PE|PV,PV,NP,PV, NP,NP,NP,NP,    // E8
        NP,NP,NP,NP,    NP,NP,NP,NP,    // F0
        NP,NP,NP,NP,    NP,NP,NP,NP,    // F8
];
219 
220 /********************************************
221  * For each opcode, determine read [0] and written [1] masks.
222  */
223 
/// Flag bits that can be or'ed into the read/write masks of the opcode
/// tables (oprw, grprw, grpf1), alongside the register masks
enum
{
    EA    = 1 << 20,    /// NOTE(review): no description in the original;
                        /// presumably the modregrm EA operand - confirm at use site
    R     = 1 << 21,    /// register (reg of modregrm field)
    N     = 1 << 22,    /// other things modified, not swappable
    B     = 1 << 23,    /// it's a byte operation
    C     = 1 << 24,    /// floating point flags
    mMEM  = 1 << 25,    /// memory
    S     = 1 << 26,    /// floating point stack
    F     = 1 << 27,    /// flags
}
235 
/**
 * Read/write masks, one pair per opcode byte 0x00..0xFF.
 *      oprw[op][0] = what the instruction reads
 *      oprw[op][1] = what the instruction writes
 * A mask is an OR of register masks (mAX, mCX, ...) and of the
 * EA/R/N/B/C/mMEM/S/F flag bits.
 * The `// xx` comment heading each group of 8 rows is the (hex) opcode of
 * the group's first row.
 */
extern (D) private immutable uint[2][256] oprw =
[
      // 00
      [ EA|R|B, F|EA|B ],       // ADD
      [ EA|R,   F|EA   ],
      [ EA|R|B, F|R|B  ],
      [ EA|R,   F|R    ],
      [ mAX,    F|mAX  ],
      [ mAX,    F|mAX  ],
      [ N,      N      ],       // PUSH ES
      [ N,      N      ],       // POP  ES

      // 08
      [ EA|R|B, F|EA|B ],       // OR
      [ EA|R,   F|EA   ],
      [ EA|R|B, F|R|B  ],
      [ EA|R,   F|R    ],
      [ mAX,    F|mAX  ],
      [ mAX,    F|mAX  ],
      [ N,      N      ],       // PUSH CS
      [ N,      N      ],       // 2 byte escape

      // 10
      [ F|EA|R|B,F|EA|B ],      // ADC
      [ F|EA|R, F|EA    ],
      [ F|EA|R|B,F|R|B  ],
      [ F|EA|R, F|R     ],
      [ F|mAX,  F|mAX   ],
      [ F|mAX,  F|mAX   ],
      [ N,      N       ],      // PUSH SS
      [ N,      N       ],      // POP  SS

      // 18
      [ F|EA|R|B,F|EA|B ],      // SBB
      [ F|EA|R, F|EA    ],
      [ F|EA|R|B,F|R|B  ],
      [ F|EA|R, F|R     ],
      [ F|mAX,  F|mAX   ],
      [ F|mAX,  F|mAX   ],
      [ N,      N       ],      // PUSH DS
      [ N,      N       ],      // POP  DS

      // 20
      [ EA|R|B, F|EA|B ],       // AND
      [ EA|R,   F|EA   ],
      [ EA|R|B, F|R|B  ],
      [ EA|R,   F|R    ],
      [ mAX,    F|mAX  ],
      [ mAX,    F|mAX  ],
      [ N,      N      ],       // SEG ES
      [ F|mAX,  F|mAX  ],       // DAA

      // 28
      [ EA|R|B, F|EA|B ],       // SUB
      [ EA|R,   F|EA   ],
      [ EA|R|B, F|R|B  ],
      [ EA|R,   F|R    ],
      [ mAX,    F|mAX  ],
      [ mAX,    F|mAX  ],
      [ N,      N      ],       // SEG CS
      [ F|mAX,  F|mAX  ],       // DAS

      // 30
      [ EA|R|B, F|EA|B ],       // XOR
      [ EA|R,   F|EA   ],
      [ EA|R|B, F|R|B  ],
      [ EA|R,   F|R    ],
      [ mAX,    F|mAX  ],
      [ mAX,    F|mAX  ],
      [ N,      N      ],       // SEG SS
      [ F|mAX,  F|mAX  ],       // AAA

      // 38
      [ EA|R|B, F ],            // CMP
      [ EA|R,   F ],
      [ EA|R|B, F ],
      [ EA|R,   F ],
      [ mAX,    F ],            // CMP AL,imm8
      [ mAX,    F ],            // CMP EAX,imm16/32
      [ N,      N ],            // SEG DS
      [ N,      N ],            // AAS

      // 40
      [ mAX,    F|mAX ],        // INC EAX
      [ mCX,    F|mCX ],
      [ mDX,    F|mDX ],
      [ mBX,    F|mBX ],
      [ mSP,    F|mSP ],
      [ mBP,    F|mBP ],
      [ mSI,    F|mSI ],
      [ mDI,    F|mDI ],

      // 48
      [ mAX,    F|mAX ],        // DEC EAX
      [ mCX,    F|mCX ],
      [ mDX,    F|mDX ],
      [ mBX,    F|mBX ],
      [ mSP,    F|mSP ],
      [ mBP,    F|mBP ],
      [ mSI,    F|mSI ],
      [ mDI,    F|mDI ],

      // 50
      [ mAX|mSP,        mSP|mMEM ],             // PUSH EAX
      [ mCX|mSP,        mSP|mMEM ],
      [ mDX|mSP,        mSP|mMEM ],
      [ mBX|mSP,        mSP|mMEM ],
      [ mSP|mSP,        mSP|mMEM ],
      [ mBP|mSP,        mSP|mMEM ],
      [ mSI|mSP,        mSP|mMEM ],
      [ mDI|mSP,        mSP|mMEM ],

      // 58
      [ mSP|mMEM,       mAX|mSP ],              // POP EAX
      [ mSP|mMEM,       mCX|mSP ],
      [ mSP|mMEM,       mDX|mSP ],
      [ mSP|mMEM,       mBX|mSP ],
      [ mSP|mMEM,       mSP|mSP ],
      [ mSP|mMEM,       mBP|mSP ],
      [ mSP|mMEM,       mSI|mSP ],
      [ mSP|mMEM,       mDI|mSP ],

      // 60
      [ N,      N ],            // PUSHA
      [ N,      N ],            // POPA
      [ N,      N ],            // BOUND Gv,Ma
      [ N,      N ],            // ARPL  Ew,Rw
      [ N,      N ],            // SEG FS
      [ N,      N ],            // SEG GS
      [ N,      N ],            // operand size prefix
      [ N,      N ],            // address size prefix

      // 68
      [ mSP,    mSP|mMEM ],     // PUSH immed16/32
      [ EA,     F|R      ],     // IMUL Gv,Ev,lv
      [ mSP,    mSP|mMEM ],     // PUSH immed8
      [ EA,     F|R      ],     // IMUL Gv,Ev,lb
      [ N,      N        ],     // INSB Yb,DX
      [ N,      N        ],     // INSW/D Yv,DX
      [ N,      N        ],     // OUTSB DX,Xb
      [ N,      N        ],     // OUTSW/D DX,Xv

      // 70
      [ F|N,    N ],
      [ F|N,    N ],
      [ F|N,    N ],
      [ F|N,    N ],
      [ F|N,    N ],
      [ F|N,    N ],
      [ F|N,    N ],
      [ F|N,    N ],

      // 78
      [ F|N,    N ],
      [ F|N,    N ],
      [ F|N,    N ],
      [ F|N,    N ],
      [ F|N,    N ],
      [ F|N,    N ],
      [ F|N,    N ],
      [ F|N,    N ],

      // 80
      [ N,      N    ],
      [ N,      N    ],
      [ N,      N    ],
      [ N,      N    ],
      [ EA|R,   F    ],         // TEST EA,r8
      [ EA|R,   F    ],         // TEST EA,r16/32
      [ EA|R,   EA|R ],         // XCHG EA,r8
      [ EA|R,   EA|R ],         // XCHG EA,r16/32

      // 88
      [ R|B,    EA|B ],         // MOV EA8,r8
      [ R,      EA ],           // MOV EA,r16/32
      [ EA|B,   R|B ],          // MOV r8,EA8
      [ EA,     R ],            // MOV r16/32,EA
      [ N,      N ],            // MOV EA,segreg
      [ EA,     R ],            // LEA r16/32,EA
      [ N,      N ],            // MOV segreg,EA
      [ mSP|mMEM, EA|mSP ],     // POP mem16/32

      // 90
      [ 0,              0       ],      // NOP
      [ mAX|mCX,        mAX|mCX ],
      [ mAX|mDX,        mAX|mDX ],
      [ mAX|mBX,        mAX|mBX ],
      [ mAX|mSP,        mAX|mSP ],
      [ mAX|mBP,        mAX|mBP ],
      [ mAX|mSI,        mAX|mSI ],
      [ mAX|mDI,        mAX|mDI ],

      // 98
      [ mAX,            mAX      ],     // CBW
      [ mAX,            mDX      ],     // CWD
      [ N,              N|F      ],     // CALL far ptr
      [ N,              N        ],     // WAIT
      [ F|mSP,          mSP|mMEM ],     // PUSHF
      [ mSP|mMEM,       F|mSP    ],     // POPF
      [ mAX,            F        ],     // SAHF
      [ F,              mAX      ],     // LAHF

      // A0
      [ mMEM,           mAX  ],         // MOV AL,moffs8
      [ mMEM,           mAX  ],         // MOV EAX,moffs32
      [ mAX,            mMEM ],         // MOV moffs8,AL
      [ mAX,            mMEM ],         // MOV moffs32,EAX
      [ N,              N    ],         // MOVSB
      [ N,              N    ],         // MOVSW/D
      [ N,              N    ],         // CMPSB
      [ N,              N    ],         // CMPSW/D

      // A8
      [ mAX,    F ],                    // TEST AL,imm8
      [ mAX,    F ],                    // TEST AX,imm16
      [ N,      N ],                    // STOSB
      [ N,      N ],                    // STOSW/D
      [ N,      N ],                    // LODSB
      [ N,      N ],                    // LODSW/D
      [ N,      N ],                    // SCASB
      [ N,      N ],                    // SCASW/D

      // B0
      [ 0,      mAX ],                  // MOV AL,imm8
      [ 0,      mCX ],
      [ 0,      mDX ],
      [ 0,      mBX ],
      [ 0,      mAX ],
      [ 0,      mCX ],
      [ 0,      mDX ],
      [ 0,      mBX ],

      // B8
      [ 0,      mAX ],                  // MOV AX,imm16
      [ 0,      mCX ],
      [ 0,      mDX ],
      [ 0,      mBX ],
      [ 0,      mSP ],
      [ 0,      mBP ],
      [ 0,      mSI ],
      [ 0,      mDI ],

      // C0
      [ EA,     F|EA ],         // Shift Eb,Ib
      [ EA,     F|EA ],
      [ N,      N    ],
      [ N,      N    ],
      [ N,      N    ],
      [ N,      N    ],
      [ 0,      EA|B ],         // MOV EA8,imm8
      [ 0,      EA   ],         // MOV EA,imm16

      // C8
      [ N,      N ],            // ENTER
      [ N,      N ],            // LEAVE
      [ N,      N ],            // RETF lw
      [ N,      N ],            // RETF
      [ N,      N ],            // INT 3
      [ N,      N ],            // INT lb
      [ N,      N ],            // INTO
      [ N,      N ],            // IRET

      // D0
      [ EA,             F|EA  ],        // Shift EA,1
      [ EA,             F|EA  ],
      [ EA|mCX,         F|EA  ],        // Shift EA,CL
      [ EA|mCX,         F|EA  ],
      [ mAX,            F|mAX ],        // AAM
      [ mAX,            F|mAX ],        // AAD
      [ N,              N     ],        // reserved
      [ mAX|mBX|mMEM,   mAX   ],        // XLAT

      // D8
      [ N,      N ],
      [ N,      N ],
      [ N,      N ],
      [ N,      N ],
      [ N,      N ],
      [ N,      N ],
      [ N,      N ],
      [ N,      N ],

      // E0
      [ F|mCX|N,mCX|N ],        // LOOPNE jb
      [ F|mCX|N,mCX|N ],        // LOOPE  jb
      [ mCX|N,  mCX|N ],        // LOOP   jb
      [ mCX|N,  N     ],        // JCXZ   jb
      [ N,      N     ],        // IN AL,lb
      [ N,      N     ],        // IN EAX,lb
      [ N,      N     ],        // OUT lb,AL
      [ N,      N     ],        // OUT lb,EAX

      // E8
      [ N,      N|F   ],        // CALL jv
      [ N,      N     ],        // JMP Jv
      [ N,      N     ],        // JMP Ab
      [ N,      N     ],        // JMP jb
      [ N|mDX,  N|mAX ],        // IN AL,DX
      [ N|mDX,  N|mAX ],        // IN AX,DX
      [ N|mAX|mDX,N   ],        // OUT DX,AL
      [ N|mAX|mDX,N   ],        // OUT DX,AX

      // F0
      [ N,      N ],            // LOCK
      [ N,      N ],            // reserved
      [ N,      N ],            // REPNE
      [ N,      N ],            // REP,REPE
      [ N,      N ],            // HLT
      [ F,      F ],            // CMC
      [ N,      N ],
      [ N,      N ],

      // F8
      [ 0,      F    ],         // CLC
      [ 0,      F    ],         // STC
      [ N,      N    ],         // CLI
      [ N,      N    ],         // STI
      [ N,      N    ],         // CLD
      [ N,      N    ],         // STD
      [ EA,     F|EA ],         // INC/DEC
      [ N,      N    ],
];
558 
/****************************************
 * Read/write masks for the "group" opcodes, where the operation is
 * selected by a field of the modregrm byte rather than by the opcode alone.
 *      grprw[group][n][0] = what the instruction reads
 *      grprw[group][n][1] = what the instruction writes
 * The groups are, in order: Grp 1, Grp 3, Grp 5, and the byte version of
 * Grp 3.  `n` selects one of the 8 operations named in the row comments
 * (presumably the reg field of the modregrm byte - confirm at use site).
 *
 * The first dimension is 4, matching the four groups actually initialized.
 * A larger dimension would leave zero-filled trailing groups, so an
 * out-of-range group index would silently look like an instruction that
 * reads and writes nothing instead of being caught by bounds checking.
 */

extern (D) private immutable uint[2][8][4] grprw =
[
    [
        // Grp 1
      [ EA,     F|EA ],           // ADD
      [ EA,     F|EA ],           // OR
      [ F|EA,   F|EA ],           // ADC
      [ F|EA,   F|EA ],           // SBB
      [ EA,     F|EA ],           // AND
      [ EA,     F|EA ],           // SUB
      [ EA,     F|EA ],           // XOR
      [ EA,     F    ],           // CMP
    ],
    [
        // Grp 3
      [ EA,     F ],              // TEST EA,imm
      [ N,      N ],              // reserved
      [ EA,     EA ],             // NOT
      [ EA,     F|EA ],           // NEG
      [ mAX|EA, F|mAX|mDX ],      // MUL
      [ mAX|EA, F|mAX|mDX ],      // IMUL
      [ mAX|mDX|EA, F|mAX|mDX ],  // DIV

        // Could generate an exception we want to catch
        //mAX|mDX|EA|N,   F|mAX|mDX|N,    // IDIV

      [ mAX|mDX|EA,     F|mAX|mDX ],      // IDIV
    ],
    [
        // Grp 5
      [ EA,     F|EA ],           // INC Ev
      [ EA,     F|EA ],           // DEC Ev
      [ N|EA,   N ],              // CALL Ev
      [ N|EA,   N ],              // CALL eP
      [ N|EA,   N ],              // JMP Ev
      [ N|EA,   N ],              // JMP Ep
      [ mSP|EA, mSP|mMEM ],       // PUSH Ev
      [ N,      N ],              // reserved
    ],
    [
        // Grp 3, byte version
      [ EA|B,   F ],              // TEST EA,imm
      [ N,      N ],              // reserved
      [ EA|B,   EA|B ],           // NOT
      [ EA|B,   F|EA|B ],         // NEG
      [ mAX|EA, F|mAX ],          // MUL
      [ mAX|EA, F|mAX ],          // IMUL
      [ mAX|EA, F|mAX ],          // DIV

        // Could generate an exception we want to catch
        //mAX|EA|N,       F|mAX|N,        // IDIV

      [ mAX|EA, F|mAX ],          // IDIV
    ]
];
618 
/********************************************
 * Read/write masks for floating point opcodes 0xD8..0xDF, with Irm < 0xC0.
 *      grpf1[op - 0xD8][n][0] = read
 *      grpf1[op - 0xD8][n][1] = write
 * There is one group of 8 rows per opcode, in the order 0xD8 .. 0xDF; `n`
 * selects the operation named in the row comment (presumably the reg field
 * of Irm - confirm at use site).
 * Masks are built from the EA/N/C/S flag bits.
 */

extern (D) private immutable uint[2][8][8] grpf1 =
[
    [
        // 0xD8
      [ EA|S,   S|C ],    // FADD  float
      [ EA|S,   S|C ],    // FMUL  float
      [ EA|S,   C ],      // FCOM  float
      [ EA|S,   S|C ],    // FCOMP float
      [ EA|S,   S|C ],    // FSUB  float
      [ EA|S,   S|C ],    // FSUBR float
      [ EA|S,   S|C ],    // FDIV  float
      [ EA|S,   S|C ],    // FDIVR float
    ],
    [
        // 0xD9
      [ EA,     S|C ],    // FLD  float
      [ N,      N ],      //
      [ S,      EA|C ],   // FST  float
      [ S,      EA|S|C ], // FSTP float
      [ N,      N ],      // FLDENV
      [ N,      N ],      // FLDCW
      [ N,      N ],      // FSTENV
      [ N,      N ],      // FSTCW
    ],
    [
        // 0xDA
      [ EA|S,   S|C ],    // FIADD  long
      [ EA|S,   S|C ],    // FIMUL  long
      [ EA|S,   C ],      // FICOM  long
      [ EA|S,   S|C ],    // FICOMP long
      [ EA|S,   S|C ],    // FISUB  long
      [ EA|S,   S|C ],    // FISUBR long
      [ EA|S,   S|C ],    // FIDIV  long
      [ EA|S,   S|C ],    // FIDIVR long
    ],
    [
        // 0xDB
      [ EA,     S|C ],    // FILD long
      [ S,      EA|S|C ], // FISTTP int
      [ S,      EA|C ],   // FIST long
      [ S,      EA|S|C ], // FISTP long
      [ N,      N ],      //
      [ EA,     S|C ],    // FLD real80
      [ N,      N ],      //
      [ S,      EA|S|C ], // FSTP real80
    ],
    [
        // 0xDC
      [ EA|S,   S|C ],    // FADD  double
      [ EA|S,   S|C ],    // FMUL  double
      [ EA|S,   C ],      // FCOM  double
      [ EA|S,   S|C ],    // FCOMP double
      [ EA|S,   S|C ],    // FSUB  double
      [ EA|S,   S|C ],    // FSUBR double
      [ EA|S,   S|C ],    // FDIV  double
      [ EA|S,   S|C ],    // FDIVR double
    ],
    [
        // 0xDD
      [ EA,     S|C ],    // FLD double
      [ S,      EA|S|C ], // FISTTP long
      [ S,      EA|C ],   // FST double
      [ S,      EA|S|C ], // FSTP double
      [ N,      N ],      // FRSTOR
      [ N,      N ],      //
      [ N,      N ],      // FSAVE
      [ C,      EA ],     // FSTSW
    ],
    [
        // 0xDE
      [ EA|S,   S|C ],    // FIADD  short
      [ EA|S,   S|C ],    // FIMUL  short
      [ EA|S,   C ],      // FICOM  short
      [ EA|S,   S|C ],    // FICOMP short
      [ EA|S,   S|C ],    // FISUB  short
      [ EA|S,   S|C ],    // FISUBR short
      [ EA|S,   S|C ],    // FIDIV  short
      [ EA|S,   S|C ],    // FIDIVR short
    ],
    [
        // 0xDF
      [ EA,     S|C ],    // FILD short
      [ S,      EA|S|C ], // FISTTP short
      [ S,      EA|C ],   // FIST short
      [ S,      EA|S|C ], // FISTP short
      [ EA,     S|C ],    // FBLD packed BCD
      [ EA,     S|C ],    // FILD long long
      [ S,      EA|S|C ], // FBSTP packed BCD
      [ S,      EA|S|C ], // FISTP long long
    ]
];
716 
717 
/********************************************
 * Micro-ops for floating point opcodes 0xD8..0xDF, with Irm < 0xC0.
 * One row per opcode, 0xD8 first.  The comment above each row names the
 * instruction for each of its 8 entries, in order; `-` marks an entry with
 * no instruction name.
 */

extern (D) private immutable ubyte[8][8] uopsgrpf1 =
[
    // 0xD8: FADD, FMUL, FCOM, FCOMP, FSUB, FSUBR, FDIV, FDIVR  (float)
    [ 2, 2, 2, 2, 2, 2, 2, 2 ],

    // 0xD9: FLD float, -, FST float, FSTP float, FLDENV, FLDCW, FSTENV, FSTCW
    [ 1, 0, 2, 2, 5, 3, 5, 5 ],

    // 0xDA: FIADD, FIMUL, FICOM, FICOMP, FISUB, FISUBR, FIDIV, FIDIVR  (long)
    [ 5, 5, 5, 5, 5, 5, 5, 5 ],

    // 0xDB: FILD long, -, FIST long, FISTP long, -, FLD real80, -, FSTP real80
    [ 4, 0, 4, 4, 0, 4, 0, 5 ],

    // 0xDC: FADD, FMUL, FCOM, FCOMP, FSUB, FSUBR, FDIV, FDIVR  (double)
    [ 2, 2, 2, 2, 2, 2, 2, 2 ],

    // 0xDD: FLD double, -, FST double, FSTP double, FRSTOR, -, FSAVE, FSTSW
    [ 1, 0, 2, 2, 5, 0, 5, 5 ],

    // 0xDE: FIADD, FIMUL, FICOM, FICOMP, FISUB, FISUBR, FIDIV, FIDIVR  (short)
    [ 5, 5, 5, 5, 5, 5, 5, 5 ],

    // 0xDF: FILD short, -, FIST short, FISTP short, FBLD packed BCD,
    //       FILD long long, FBSTP packed BCD, FISTP long long
    [ 4, 0, 4, 4, 5, 4, 5, 4 ],
];
813 
/**************************************************
 * Number of micro-ops for Pentium Pro and Pentium II processors,
 * one entry per opcode byte 0x00..0xFF.
 * 0 means special case (the count is worked out by uops() instead),
 * 5 means 'complex'
 * The trailing comment on each row of 8 is the (hex) opcode of the
 * row's first entry.
 */

extern (D) private immutable ubyte[256] insuops =
[       0,0,0,0,        1,1,4,5,                /* 00 */
        0,0,0,0,        1,1,4,0,                /* 08 */
        0,0,0,0,        2,2,4,5,                /* 10 */
        0,0,0,0,        2,2,4,5,                /* 18 */
        0,0,0,0,        1,1,0,1,                /* 20 */
        0,0,0,0,        1,1,0,1,                /* 28 */
        0,0,0,0,        1,1,0,1,                /* 30 */
        0,0,0,0,        1,1,0,1,                /* 38 */
        1,1,1,1,        1,1,1,1,                /* 40 */
        1,1,1,1,        1,1,1,1,                /* 48 */
        3,3,3,3,        3,3,3,3,                /* 50 */
        2,2,2,2,        3,2,2,2,                /* 58 */
        5,5,5,5,        0,0,0,0,                /* 60 */
        3,3,0,0,        5,5,5,5,                /* 68 */
        1,1,1,1,        1,1,1,1,                /* 70 */
        1,1,1,1,        1,1,1,1,                /* 78 */
        0,0,0,0,        0,0,0,0,                /* 80 */
        0,0,0,0,        0,1,4,0,                /* 88 */
        1,3,3,3,        3,3,3,3,                /* 90 */
        1,1,5,0,        5,5,1,1,                /* 98 */
        1,1,2,2,        5,5,5,5,                /* A0 */
        1,1,3,3,        2,2,3,3,                /* A8 */
        1,1,1,1,        1,1,1,1,                /* B0 */
        1,1,1,1,        1,1,1,1,                /* B8 */
        0,0,5,4,        0,0,0,0,                /* C0 */
        5,3,5,5,        5,3,5,5,                /* C8 */
        0,0,0,0,        4,3,0,2,                /* D0 */
        0,0,0,0,        0,0,0,0,                /* D8 */
        4,4,4,2,        5,5,5,5,                /* E0 */
        4,1,5,1,        5,5,5,5,                /* E8 */
        0,0,5,5,        5,1,0,0,                /* F0 */
        1,1,5,5,        4,4,0,0,                /* F8 */
];
854 
// NOTE(review): 8-entry micro-op table; its use is not visible in this part
// of the file.  Presumably indexed by a 3-bit modregrm field, with 5 meaning
// 'complex' as in insuops - confirm at the use site.
extern (D) private immutable ubyte[8] uopsx = [ 1,1,2,5,1,1,1,5 ];
856 
/************************************************
 * Determine number of micro-ops for Pentium Pro and Pentium II processors.
 *
 * The count normally comes straight from the `insuops` table. A table entry
 * of 0 marks an opcode whose count depends on the modregrm byte; those are
 * decoded here. Every two-byte (0F xx) opcode shares the table slot 0x0F.
 *
 * Special cases decoded here cover the integer ALU/MOV/shift groups, the
 * floating point escapes D8..DF, and a few 0F opcodes (Jcc, SETcc,
 * MOVZX/MOVSX, IMUL). Prefix bytes and MMX are not taken into account.
 *
 * Params:
 *      c = instruction to classify
 * Returns:
 *      number of micro-ops; 5 means 'complex', and is also the answer
 *      for any encoding that is not recognized
 */

private int uops(code *c)
{
    // Every two-byte (0F xx) opcode is looked up under the single slot 0x0F
    const int op = ((c.Iop & 0xFF00) == 0x0F00) ? 0x0F : (c.Iop & 0xFF);

    int n = insuops[op];
    if (n)
        return n;                       // fixed count, nothing to decode

    // Table entry of 0: the count depends on the modregrm byte
    const ubyte irm = c.Irm;
    const ubyte mod = (irm >> 6) & 3;
    const ubyte reg = (irm >> 3) & 7;
    const bool regEA = (mod == 3);      // mod field of 3: operand is a register

    switch (op)
    {
        case 0x10, 0x11:                // ADC rm,r
        case 0x18, 0x19:                // SBB rm,r
            n = regEA ? 2 : 4;
            break;

        case 0x12, 0x13:                // ADC r,rm
        case 0x1A, 0x1B:                // SBB r,rm
            n = regEA ? 2 : 3;
            break;

        case 0x00, 0x01:                // ADD rm,r
        case 0x08, 0x09:                // OR rm,r
        case 0x20, 0x21:                // AND rm,r
        case 0x28, 0x29:                // SUB rm,r
        case 0x30, 0x31:                // XOR rm,r
            n = regEA ? 1 : 4;
            break;

        case 0x02, 0x03:                // ADD r,rm
        case 0x0A, 0x0B:                // OR r,rm
        case 0x22, 0x23:                // AND r,rm
        case 0x2A, 0x2B:                // SUB r,rm
        case 0x32, 0x33:                // XOR r,rm
        case 0x38, 0x39:                // CMP rm,r
        case 0x3A, 0x3B:                // CMP r,rm
        case 0x69:                      // IMUL rm,r,imm
        case 0x6B:                      // IMUL rm,r,imm8
        case 0x84, 0x85:                // TEST rm,r
            n = regEA ? 1 : 2;
            break;

        case 0x80, 0x81, 0x82, 0x83:
            if (reg == 2 || reg == 3)   // ADC/SBB rm,imm
                n = regEA ? 2 : 4;
            else if (reg == 7)          // CMP rm,imm
                n = regEA ? 1 : 2;
            else
                n = regEA ? 1 : 4;
            break;

        case 0x86, 0x87:                // XCHG rm,r
            n = regEA ? 3 : 5;
            break;

        case 0x88, 0x89:                // MOV rm,r
            n = regEA ? 1 : 2;
            break;

        case 0x8A, 0x8B:                // MOV r,rm
            n = 1;
            break;

        case 0x8C:                      // MOV involving a segment register
            n = regEA ? 1 : 3;
            break;

        case 0x8F:
            if (reg == 0)               // POP m
                n = 5;
            break;

        case 0xC6, 0xC7:
            if (reg == 0)               // MOV rm,imm
                n = regEA ? 1 : 2;
            break;

        case 0xD0, 0xD1:
            if (reg == 2 || reg == 3)   // RCL/RCR rm,1
                n = regEA ? 2 : 4;
            else
                n = regEA ? 1 : 4;
            break;

        case 0xC0, 0xC1:                // shift/rotate rm,imm8
        case 0xD2, 0xD3:                // shift/rotate rm,CL
            if (reg == 2 || reg == 3)   // RCL/RCR
                n = 5;
            else
                n = regEA ? 1 : 4;
            break;

        case 0xD8: .. case 0xDF:
            // Floating point opcodes
            if (irm < 0xC0)
            {
                // memory operand: count is selected by opcode and reg field
                n = uopsgrpf1[op - 0xD8][reg];
                break;
            }
            // register form: per-opcode default, then individual exceptions
            n = uopsx[op - 0xD8];
            if (op == 0xD9)
            {
                switch (irm)
                {
                    case 0xE0:          // FCHS
                        n = 3;
                        break;

                    case 0xE8: .. case 0xED:
                        n = 2;
                        break;

                    case 0xF0: .. case 0xF5:
                    case 0xF8, 0xF9:
                    case 0xFB: .. case 0xFF:
                        n = 5;
                        break;

                    default:
                        break;
                }
            }
            else if (op == 0xDE && irm == 0xD9) // FCOMPP
                n = 2;
            break;

        case 0xF6:
            if (reg == 6 || reg == 7)           // DIV AL,rm8
                n = regEA ? 3 : 4;
            else if (reg == 4 || reg == 5 || reg == 0)  // MUL/IMUL/TEST rm8
                n = regEA ? 1 : 2;
            else if (reg == 2 || reg == 3)      // NOT/NEG rm
                n = regEA ? 1 : 4;
            break;

        case 0xF7:
            if (reg == 6 || reg == 7)           // DIV EAX,rm
                n = 4;
            else if (reg == 4 || reg == 5)      // MUL/IMUL rm
                n = regEA ? 3 : 4;
            else if (reg == 2 || reg == 3)      // NOT/NEG rm
                n = regEA ? 1 : 4;
            break;

        case 0xFF:
            if (reg == 2 || reg == 3 ||         // CALL rm, CALL m,rm
                reg == 5)                       // JMP seg:offset
                n = 5;
            else if (reg == 4)
                n = regEA ? 1 : 2;
            else if (reg == 0 || reg == 1)      // INC/DEC rm
                n = regEA ? 1 : 4;
            else if (reg == 6)                  // PUSH rm
                n = regEA ? 3 : 4;
            break;

        case 0x0F:
        {
            const uint op2 = c.Iop & 0xFF;      // second byte of the 0F xx opcode
            if ((op2 & 0xF0) == 0x80)           // Jcc
                n = 1;
            else if ((op2 & 0xF0) == 0x90)      // SETcc
                n = regEA ? 1 : 3;
            else if (op2 == 0xB6 || op2 == 0xB7 ||      // MOVZX
                     op2 == 0xBE || op2 == 0xBF)        // MOVSX
                n = 1;
            else if (op2 == 0xAF)               // IMUL r,m
                n = regEA ? 1 : 2;
            break;
        }

        default:
            break;
    }

    // Anything still undecided is treated as complex (copout for now)
    return n ? n : 5;
}
1113 
/******************************************
 * Determine the Pentium pairing classification of an instruction.
 *
 * Starts from the opcode's entry in `pentcycl` and refines it using the
 * modregrm byte. Two-byte (0F xx) opcodes share the table slot 0x0F.
 * Floating point gets only minimal treatment: a handful of encodings are
 * classed FX, FXCH (D9 C8..CF) is classed PV, the rest keep the table value.
 *
 * Params:
 *      c = instruction to classify
 * Returns:
 *      NP,UV,PU,PV optionally OR'd with PE (PF is OR'd in for Jcc),
 *      or FX for the floating point encodings noted above
 */

private int pair_class(code *c)
{
    // Of course, with Intel this is *never* simple, and Intel's
    // documentation is vague about the specifics.

    const uint op = ((c.Iop & 0xFF00) == 0x0F00) ? 0x0F : (c.Iop & 0xFF);
    int pc = pentcycl[op];

    // Effective address size: the CFaddrsize flag flips the default
    uint a32 = I32;
    if (c.Iflags & CFaddrsize)
        a32 ^= 1;

    const ubyte irm = c.Irm;
    const ubyte mod = (irm >> 6) & 3;
    const ubyte reg = (irm >> 3) & 7;
    const ubyte rm  = irm & 7;

    // Set for the immediate-operand forms, which lose their pairing class
    // when the EA also carries a displacement
    bool dispMatters = false;

    switch (op)
    {
        case 0x0F:                              // 2 byte opcode
            if ((c.Iop & 0xF0) == 0x80)         // if Jcc
                pc = PV | PF;
            break;

        case 0x80, 0x81, 0x83:
            // ADC/SBB EA,immed are U pipe only;
            // AND/OR/XOR/ADD/SUB/CMP EA,immed go in either pipe
            pc = (reg == 2 || reg == 3) ? PU : UV;
            dispMatters = true;
            break;

        case 0x84, 0x85:                        // TEST EA,reg
            if (mod == 3)                       // TEST reg,reg
                pc = UV;
            break;

        case 0xC0, 0xC1:
            if (reg >= 4)
                pc = PU;
            break;

        case 0xC6, 0xC7:
            if (reg == 0)                       // MOV EA,immed
            {
                pc = UV;
                dispMatters = true;
            }
            break;

        case 0xD9:
            if (irm < 0xC0)
            {
                if (reg == 0)
                    pc = FX;
            }
            else if (irm < 0xC8)
                pc = FX;
            else if (irm < 0xD0)
                pc = PV;
            else if (irm == 0xE0 || irm == 0xE1 || irm == 0xE4)
                pc = FX;
            break;

        case 0xDB, 0xDF:
            if (irm < 0xC0 && (reg == 0 || reg == 5))
                pc = FX;
            break;

        case 0xDD:
            if (irm < 0xC0)
            {
                if (reg == 0)
                    pc = FX;
            }
            else if (irm >= 0xE0 && irm < 0xF0)
                pc = FX;
            break;

        case 0xFE:
            if (reg == 0 || reg == 1)           // INC/DEC EA
                pc = UV;
            break;

        case 0xFF:
            if (reg == 0 || reg == 1)           // INC/DEC EA
                pc = UV;
            else if (reg == 2 || reg == 4)      // CALL/JMP near ptr EA
                pc = PE|PV;
            else if (reg == 6 && mod == 3)      // PUSH reg
                pc = PE | UV;
            break;

        default:
            break;
    }

    if (dispMatters)
    {
        // if EA contains a displacement then
        // can't execute in V, or pair in U
        if (mod == 1 || mod == 2)
            pc = NP;
        else if (mod == 0)
        {
            if (a32)
            {
                if (rm == 5 || (rm == 4 && (c.Isib & 7) == 5))
                    pc = NP;
            }
            else if (rm == 6)
                pc = NP;
        }
    }

    if (c.Iflags & CFPREFIX && pc == UV)        // if prefix byte
        pc = PU;
    return pc;
}
1267 
/******************************************
 * For an instruction, determine what is read
 * and what is written, and what is used for addressing.
 * Determine operand size if EA (larger is ok).
 *
 * The read/write sets start from the `oprw` table entry for the opcode and
 * are then refined per opcode. Two-byte (0F xx) opcodes are all treated as
 * opcode 0x0F.
 *
 * Besides filling in `ci`, this function modifies `c` itself:
 *   - CFpsw is set in c.Iflags for CMP and for RCL/RCR;
 *   - for opcodes A0..A3 the second operand (IFL2/IEV2) is copied into the
 *     first (IFL1/IEV1) so the instruction looks like it has an EA;
 *   - for memory operands the displacement in IFL1/IEV1 is normalized
 *     (missing displacement becomes constant 0, 8 bit displacement is sign
 *     extended) so that addressing modes can be compared simply.
 *
 * Params:
 *      ci = receives the result. Being an `out` parameter it is reset to its
 *           default state on entry, and is left that way if `c` is null.
 *      c = instruction to analyze, may be null
 */

@trusted
private void getinfo(out Cinfo ci,code *c)
{
    if (!c)
        return;
    ci.c = c;

    // Scheduling model specific data: micro-ops and encoded size for the
    // Pentium Pro scheduler, pairing class otherwise
    if (PRO)
    {
        ci.uops = cast(ubyte)uops(c);
        ci.isz = cast(ubyte)calccodsize(c);
    }
    else
        ci.pair = cast(ubyte)pair_class(c);

    ubyte op;
    ubyte op2;
    ubyte irm,mod,reg,rm;
    uint a32;
    uint r,w;
    int sz = I32 ? 4 : 2;               // default operand size in bytes

    ci.r = 0;
    ci.w = 0;
    ci.a = 0;
    op = c.Iop & 0xFF;
    if ((c.Iop & 0xFF00) == 0x0F00)
        op = 0x0F;
    //printf("\tgetinfo %x, op %x \n",c,op);
    a32 = I32;
    if (c.Iflags & CFaddrsize)          // address size override flips the default
        a32 ^= 1;
    if (c.Iflags & CFopsize)            // operand size override swaps 2 <-> 4
        sz ^= 2 | 4;
    irm = c.Irm;
    mod = (irm >> 6) & 3;
    reg = (irm >> 3) & 7;
    rm = irm & 7;

    // Table defaults for what the opcode reads and writes
    r = oprw[op][0];
    w = oprw[op][1];

    switch (op)
    {
        case 0x50:
        case 0x51:
        case 0x52:
        case 0x53:
        case 0x55:
        case 0x56:
        case 0x57:                              // PUSH reg
            ci.flags |= CIFL.push;
            goto Lpush;

        case 0x54:                              // PUSH ESP
        case 0x6A:                              // PUSH imm8
        case 0x68:                              // PUSH imm
        case 0x0E:
        case 0x16:
        case 0x1E:
        case 0x06:
        case 0x9C:
        Lpush:
            // stack shrinks by one operand; SP is used for addressing
            ci.spadjust = -sz;
            ci.a |= mSP;
            break;

        case 0x58:
        case 0x59:
        case 0x5A:
        case 0x5B:
        case 0x5C:
        case 0x5D:
        case 0x5E:
        case 0x5F:                              // POP reg
        case 0x1F:
        case 0x07:
        case 0x17:
        case 0x9D:                              // POPF
        Lpop:
            // stack grows back by one operand; SP is used for addressing
            ci.spadjust = sz;
            ci.a |= mSP;
            break;

        case 0x80:
            if (reg == 7)                       // CMP
                c.Iflags |= CFpsw;
            r = B | grprw[0][reg][0];           // Grp 1 (byte)
            w = B | grprw[0][reg][1];
            break;

        case 0x81:
        case 0x83:
            if (reg == 7)                       // CMP
                c.Iflags |= CFpsw;
            else if (irm == modregrm(3,0,SP))   // ADD ESP,imm
            {
                assert(c.IFL2 == FLconst);
                // opcode 0x83 carries an 8 bit immediate, so sign extend it
                ci.spadjust = (op == 0x81) ? c.IEV2.Vint : cast(byte)c.IEV2.Vint;
            }
            else if (irm == modregrm(3,5,SP))   // SUB ESP,imm
            {
                assert(c.IFL2 == FLconst);
                ci.spadjust = (op == 0x81) ? -c.IEV2.Vint : -cast(int)cast(byte)c.IEV2.Vint;
            }
            r = grprw[0][reg][0];               // Grp 1
            w = grprw[0][reg][1];
            break;

        case 0x8F:
            if (reg == 0)                       // POP rm
                goto Lpop;
            break;

        case 0xA0:
        case 0xA1:
        case 0xA2:
        case 0xA3:
            // Fake having an EA to simplify code in conflict()
            ci.flags |= CIFL.ea;
            ci.reg = 0;
            ci.sibmodrm = a32 ? modregrm(0,0,5) : modregrm(0,0,6);
            c.IFL1 = c.IFL2;
            c.IEV1 = c.IEV2;
            break;

        case 0xC2:
        case 0xC3:
        case 0xCA:
        case 0xCB:                              // RET
            ci.a |= mSP;
            break;

        case 0xE8:
            if (c.Iflags & CFclassinit)        // call to __j_classinit
            {   r = 0;
                w = F;

version (CLASSINIT2)
                ci.pair = UV;                  // it is patched to CMP EAX,0
else
                ci.pair = NP;

            }
            break;

        case 0xF6:
            r = grprw[3][reg][0];               // Grp 3, byte version
            w = grprw[3][reg][1];
            break;

        case 0xF7:
            r = grprw[1][reg][0];               // Grp 3
            w = grprw[1][reg][1];
            break;

        case 0x0F:
            // Two-byte opcodes skip the register/EA analysis below entirely
            op2 = c.Iop & 0xFF;
            if ((op2 & 0xF0) == 0x80)           // if Jxx instructions
            {
                ci.r = F | N;
                ci.w = N;
                goto Lret;
            }
            ci.r = N;
            ci.w = N;          // copout for now
            goto Lret;

        case 0xD7:                              // XLAT
            ci.a = mAX | mBX;
            break;

        case 0xFF:
            r = grprw[2][reg][0];               // Grp 5
            w = grprw[2][reg][1];
            if (reg == 6)                       // PUSH rm
                goto Lpush;
            break;

        case 0x38:
        case 0x39:
        case 0x3A:
        case 0x3B:
        case 0x3C:                              // CMP AL,imm8
        case 0x3D:                              // CMP EAX,imm32
            // For CMP opcodes, always test for flags
            c.Iflags |= CFpsw;
            break;

        case ESCAPE:
            if (c.Iop == (ESCAPE | ESCadjfpu))
                ci.fpuadjust = c.IEV1.Vint;
            break;

        case 0xD0:
        case 0xD1:
        case 0xD2:
        case 0xD3:
        case 0xC0:
        case 0xC1:
            if (reg == 2 || reg == 3)           // if RCL or RCR
                c.Iflags |= CFpsw;             // always test for flags
            break;

        case 0xD8:
        case 0xD9:
        case 0xDA:
        case 0xDB:
        case 0xDC:
        case 0xDD:
        case 0xDE:
        case 0xDF:
            if (irm < 0xC0)
            {
                // Memory operand forms: read/write sets come from grpf1;
                // the cases below record the FPU stack adjustment, the kind
                // of FP operation and the memory operand size
                r = grpf1[op - 0xD8][reg][0];
                w = grpf1[op - 0xD8][reg][1];
                switch (op)
                {
                    case 0xD8:
                        if (reg == 3)           // if FCOMP
                            ci.fpuadjust = -1;
                        else
                            ci.fp_op = FP.fop;
                        break;

                    case 0xD9:
                        if (reg == 0)           // if FLD float
                        {   ci.fpuadjust = 1;
                            ci.fp_op = FP.fld;
                        }
                        else if (reg == 3)      // if FSTP float
                        {   ci.fpuadjust = -1;
                            ci.fp_op = FP.fstp;
                        }
                        else if (reg == 5 || reg == 7)
                            sz = 2;
                        else if (reg == 4 || reg == 6)
                            sz = 28;
                        break;
                    case 0xDA:
                        if (reg == 3)           // if FICOMP
                            ci.fpuadjust = -1;
                        break;
                    case 0xDB:
                        if (reg == 0 || reg == 5)
                        {   ci.fpuadjust = 1;
                            ci.fp_op = FP.fld;  // FILD / FLD long double
                        }
                        if (reg == 3 || reg == 7)
                            ci.fpuadjust = -1;
                        if (reg == 7)
                            ci.fp_op = FP.fstp; // FSTP long double
                        if (reg == 5 || reg == 7)
                            sz = 10;
                        break;
                    case 0xDC:
                        sz = 8;
                        if (reg == 3)           // if FCOMP
                            ci.fpuadjust = -1;
                        else
                            ci.fp_op = FP.fop;
                        break;
                    case 0xDD:
                        if (reg == 0)           // if FLD double
                        {   ci.fpuadjust = 1;
                            ci.fp_op = FP.fld;
                        }
                        if (reg == 3)           // if FSTP double
                        {   ci.fpuadjust = -1;
                            ci.fp_op = FP.fstp;
                        }
                        if (reg == 7)
                            sz = 2;
                        else if (reg == 4 || reg == 6)
                            sz = 108;
                        else
                            sz = 8;
                        break;
                    case 0xDE:
                        sz = 2;
                        if (reg == 3)           // if FICOMP
                            ci.fpuadjust = -1;
                        break;
                    case 0xDF:
                        sz = 2;
                        if (reg == 4 || reg == 6)
                            sz = 10;
                        else if (reg == 5 || reg == 7)
                            sz = 8;
                        if (reg == 0 || reg == 4 || reg == 5)
                            ci.fpuadjust = 1;
                        else if (reg == 3 || reg == 6 || reg == 7)
                            ci.fpuadjust = -1;
                        break;

                    default:
                        break;
                }
                break;          // leaves the opcode switch; the code below is for register forms only
            }
            else if (op == 0xDE)
            {   ci.fpuadjust = -1;             // pop versions of Fop's
                if (irm == 0xD9)
                    ci.fpuadjust = -2;         // FCOMPP
            }

            // Most floating point opcodes aren't staged, but are
            // sent right through, in order to make use of the large
            // latencies with floating point instructions.
            if (ci.fp_op == FP.fld ||
                (op == 0xD9 && (irm & 0xF8) == 0xC0))
            { }                                // FLD ST(i)
            else
                ci.flags |= CIFL.nostage;

            // Read/write sets for the register forms
            switch (op)
            {
                case 0xD8:
                    r = S;
                    w = C;
                    if ((irm & ~7) == 0xD0)
                        w |= S;
                    break;
                case 0xD9:
                    // FCHS or FABS or FSQRT
                    if (irm == 0xE0 || irm == 0xE1 || irm == 0xFA)
                        ci.fp_op = FP.fop;
                    r = S;
                    w = S|C;
                    break;
                case 0xDA:
                    if (irm == 0xE9)    // FUCOMPP
                    {   r = S;
                        w = S|C;
                        break;
                    }
                    break;
                case 0xDB:
                    if (irm == 0xE2)    // FCLEX
                    {   r = 0;
                        w = C;
                        break;
                    }
                    if (irm == 0xE3)    // FINIT
                    {   r = 0;
                        w = S|C;
                        break;
                    }
                    break;
                case 0xDC:
                case 0xDE:
                    if ((irm & 0xF0) != 0xD0)
                    {   r = S;
                        w = S|C;
                        break;
                    }
                    break;
                case 0xDD:
                    // Not entirely correct, but conservative
                    r = S;
                    w = S|C;
                    break;
                case 0xDF:
                    if (irm == 0xE0)    // FSTSW AX
                    {   r = C;
                        w = mAX;
                        break;
                    }
                    break;

                default:
                    break;
            }
            break;

        default:
            //printf("\t\tNo special case\n");
            break;
    }

    if ((r | w) & B)                            // if byte operation
        sz = 1;                                 // operand size is 1

    // Replace the R placeholder with the register named by the reg field;
    // for byte operations only the low 2 bits of the field are used
    ci.r = r & ~(R | EA);
    ci.w = w & ~(R | EA);
    if (r & R)
        ci.r |= mask((r & B) ? (reg & 3) : reg);
    if (w & R)
        ci.w |= mask((w & B) ? (reg & 3) : reg);

    // OR in bits for EA addressing mode
    if ((r | w) & EA)
    {   ubyte sib;

        sib = 0;
        switch (mod)
        {
            case 0:
                if (a32)
                {
                    if (rm == 4)
                    {
                        sib = c.Isib;
                        if ((sib & modregrm(0,7,0)) != modregrm(0,4,0))
                            ci.a |= mask((sib >> 3) & 7);      // index register
                        if ((sib & 7) != 5)
                            ci.a |= mask(sib & 7);             // base register
                    }
                    else if (rm != 5)
                        ci.a |= mask(rm);
                }
                else
                {
                    // registers used by each 16 bit addressing mode;
                    // rm 6 with mod 0 uses none
                    immutable ubyte[8] ea16 = [mBX|mSI,mBX|mDI,mBP|mSI,mBP|mDI,mSI,mDI,0,mBX];
                    ci.a |= ea16[rm];
                }
                goto Lmem;

            case 1:
            case 2:
                if (a32)
                {
                    if (rm == 4)
                    {
                        sib = c.Isib;
                        if ((sib & modregrm(0,7,0)) != modregrm(0,4,0))
                            ci.a |= mask((sib >> 3) & 7);      // index register
                        ci.a |= mask(sib & 7);                 // base register
                    }
                    else
                        ci.a |= mask(rm);
                }
                else
                {
                    // same table as for mod 0, except that rm 6 uses BP
                    immutable ubyte[8] ea16 = [mBX|mSI,mBX|mDI,mBP|mSI,mBP|mDI,mSI,mDI,mBP,mBX];
                    ci.a |= ea16[rm];
                }

            Lmem:
                // the EA is a memory operand
                if (r & EA)
                    ci.r |= mMEM;
                if (w & EA)
                    ci.w |= mMEM;
                ci.flags |= CIFL.ea;
                break;

            case 3:
                // the EA is the register named by the rm field
                if (r & EA)
                    ci.r |= mask((r & B) ? (rm & 3) : rm);
                if (w & EA)
                    ci.w |= mask((w & B) ? (rm & 3) : rm);
                break;

            default:
                assert(0);
        }
        // Adjust sibmodrm so that addressing modes can be compared simply
        irm &= modregrm(3,0,7);
        if (a32)
        {
            if (irm != modregrm(0,0,5))
            {
                switch (mod)
                {
                case 0:
                    if ((sib & 7) != 5)     // if not disp32[index]
                    {
                        // no displacement: record an explicit 0 and set the 0x80 bit in irm
                        c.IFL1 = FLconst;
                        c.IEV1.Vpointer = 0;
                        irm |= 0x80;
                    }
                    break;
                case 1:
                    // 8 bit displacement: sign extend it and record the mode as mod 2
                    c.IEV1.Vpointer = cast(byte) c.IEV1.Vpointer;
                    irm = modregrm(2, 0, rm);
                    break;

                default:
                    break;
                }
            }
        }
        else
        {
            if (irm != modregrm(0,0,6))
            {
                switch (mod)
                {
                    case 0:
                        c.IFL1 = FLconst;
                        c.IEV1.Vpointer = 0;
                        irm |= 0x80;
                        break;
                    case 1:
                        c.IEV1.Vpointer = cast(byte) c.IEV1.Vpointer;
                        irm = modregrm(2, 0, rm);
                        break;

                    default:
                        break;
                }
            }
        }

        ci.r |= ci.a;                   // registers used for addressing are also read
        ci.reg = reg;
        ci.sibmodrm = (sib << 8) | irm;
    }
Lret:
    if (ci.w & mSP)                    // if stack pointer is modified
        ci.w |= mMEM;                  // then we are implicitly writing to memory
    if (op == LEA)                     // if LEA
        ci.r &= ~mMEM;                 // memory is not actually read
    ci.sz = cast(ubyte)sz;

    //printf("\t\t"); ci.print();
}
1792 
/******************************************
 * Determine if two instructions can pair.
 * Assume that in general, cu can pair in the U pipe and cv in the V.
 * Look for things like register contentions.
 *
 * Pairing is refused when cu writes a register that cv reads or writes
 * (unless the only such register is SP and both instructions carry PE),
 * or when cu writes flags that cv reads and cv does not carry PF.
 *
 * Params:
 *      cu = instruction for U pipe
 *      cv = instruction for V pipe
 * Returns:
 *      !=0 if they can pair
 */

private int pair_test(const ref Cinfo cu, const ref Cinfo cv)
{
    const uint classU = cu.pair;

    if (!(classU & PU))
    {
        // cu can't go in the U pipe. The one remaining possibility is an
        // FX class instruction followed by FXCH.
        if (classU & FX && cv.c.Iop == 0xD9 && (cv.c.Irm & ~7) == 0xC8)
            return 1;
        return 0;
    }

    const uint classV = cv.pair;
    if (!(classV & PV))
        return 0;

    // Registers written by cu that cv reads or writes
    // (flags and memory are not considered here)
    const uint clash = cu.w & (cv.r | cv.w) & ~(F|mMEM);

    // Contention on SP alone is tolerated when both instructions carry PE
    const bool spException = clash == mSP && (classU & classV & PE);
    if (clash && !spException)
        return 0;

    // Look for flags contention
    if (cu.w & cv.r & F && !(classV & PF))
        return 0;

    return 1;
}
1845 
/******************************************
 * Determine if c1 followed by c2 has an AGI (address generation
 * interlock): c1 writes a register that c2 uses to form an address.
 * The case where SP is the only such register and both instructions
 * carry the PE pairing flag is not counted.
 * Returns:
 *      1 if they have an AGI, 0 if not
 */

private int pair_agi(const ref Cinfo c1, const ref Cinfo c2) pure
{
    const uint overlap = c1.w & c2.a;
    if (!overlap)
        return 0;

    const bool spException = overlap == mSP && (c1.pair & c2.pair & PE);
    return spException ? 0 : 1;
}
1857 
/********************************************
 * Determine if three instructions can decode simultaneously
 * in Pentium Pro and Pentium II.
 * Params:
 *      c0 = candidate for decoder 0, must not be null
 *      c1 = candidate for decoder 1; if null the answer is 0
 *      c2 = candidate for decoder 2, can be null
 * Returns:
 *      1 if they can decode simultaneously, 0 if not
 */

private int triple_test(Cinfo *c0, Cinfo *c1, Cinfo *c2)
{
    assert(c0);
    if (c1 is null)
        return 0;

    // An absent third instruction contributes no bytes
    const int len2 = c2 ? c2.isz : 0;

    // Each instruction may be at most 7 bytes, and all of them
    // together at most 16 bytes
    const bool sizesOk = c0.isz <= 7 && c1.isz <= 7 && len2 <= 7 &&
                         c0.isz + c1.isz + len2 <= 16;
    if (!sizesOk)
        return 0;

    // 4-1-1 decode: the instructions for decoders 1 and 2 must not
    // need more than one uop
    const bool tailSimple = c1.uops <= 1 && (c2 is null || c2.uops <= 1);
    return tailSimple ? 1 : 0;
}
1885 
/********************************************
 * Get next instruction worth looking at for scheduling.
 * Starting after c, NOPs and line number escapes are skipped unless
 * they are jump targets (CFtarg or CFtarg2).
 * Returns:
 *      the next such instruction, or
 *      null if there are no more instructions
 */

private code * cnext(code *c)
{
    for (c = code_next(c); c; c = code_next(c))
    {
        // A jump target is always worth looking at
        if (c.Iflags & (CFtarg | CFtarg2))
            break;

        // So is anything that is neither a NOP nor a line number escape
        if (c.Iop != NOP && c.Iop != (ESCAPE | ESClinnum))
            break;
    }
    return c;
}
1907 
/******************************************
 * Instruction scheduler.
 * (This header describes schedule(), which is defined further below,
 * not the function that immediately follows.)
 * Input:
 *      c               list of instructions to schedule
 *      scratch         scratch registers we can use
 * Returns:
 *      revised list of scheduled instructions
 */
1916 
/**
 * Determine if c1 and c2 are swappable, where c1 comes before c2.
 *
 * Instructions marked CFvolatile or CFvex always conflict, as do
 * instructions with the N bit set in their read mask. Otherwise the
 * read/write masks are compared; when the only overlap is memory,
 * the addressing modes and offsets are examined to see whether the
 * two references can be shown to be distinct.
 *
 * Params:
 *      ci1 = info for the earlier instruction (c1)
 *      ci2 = info for the later instruction (c2)
 *      fpsched = only honored when both instructions have a nonzero fp_op:
 *                if 1, adjust ci2's fxch_pre and fxch_post so the two
 *                can be swapped, then return 0;
 *                if 2, adjust ci1 as well as ci2;
 *                if 0, FPU register conflicts are not handled separately
 * Returns:
 *      0 if they do not conflict,
 *      0x100 + delay_clocks if they do conflict
 */
@trusted
private int conflict(Cinfo *ci1,Cinfo *ci2,int fpsched)
{
    code *c1;
    code *c2;
    uint r1,w1,a1;
    uint r2,w2,a2;
    int sz1,sz2;
    int i = 0;                          // debugging aid: set nonzero to trace
    int delay_clocks;

    c1 = ci1.c;
    c2 = ci2.c;

    //printf("conflict %x %x\n",c1,c2);

    r1 = ci1.r;
    w1 = ci1.w;
    a1 = ci1.a;
    sz1 = ci1.sz;

    r2 = ci2.r;
    w2 = ci2.w;
    a2 = ci2.a;
    sz2 = ci2.sz;

    //printf("r1 %lx w1 %lx a1 %lx sz1 %x\n",r1,w1,a1,sz1);
    //printf("r2 %lx w2 %lx a2 %lx sz2 %x\n",r2,w2,a2,sz2);

    if ((c1.Iflags | c2.Iflags) & (CFvolatile | CFvex))
        goto Lconflict;

    // Determine if we should handle FPU register conflicts separately
    //if (fpsched) printf("fp_op %d,%d:\n",ci1.fp_op,ci2.fp_op);
    if (fpsched && ci1.fp_op && ci2.fp_op)
    {
        // The S and C bits are resolved by the FXCH bookkeeping at Lswap,
        // so take them out of the generic read/write comparison
        w1 &= ~(S|C);
        r1 &= ~(S|C);
        w2 &= ~(S|C);
        r2 &= ~(S|C);
    }
    else
        fpsched = 0;

    if ((r1 | r2) & N)
    {
        goto Lconflict;
    }

static if (0)
{
    if (c1.Iop == 0xFF && c2.Iop == 0x8B)
    {   c1.print(); c2.print(); i = 1;
        printf("r1=%lx, w1=%lx, a1=%lx, sz1=%d, r2=%lx, w2=%lx, a2=%lx, sz2=%d\n",r1,w1,a1,sz1,r2,w2,a2,sz2);
    }
}
L1:
    // Overlap: c1 writes what c2 reads, or c2 writes what c1 reads or writes
    if (w1 & r2 || (r1 | w1) & w2)
    {   ubyte ifl1,ifl2;

        if (i) printf("test\n");

static if (0)
{
        if (c1.IFL1 != c2.IFL1) printf("t1\n");
        if ((c1.Irm & modregrm(3,0,7)) != (c2.Irm & modregrm(3,0,7))) printf("t2\n");
        if ((issib(c1.Irm) && c1.Isib != c2.Isib)) printf("t3\n");
        if (c1.IEV1.Vpointer + sz1 <= c2.IEV1.Vpointer) printf("t4\n");
        if (c2.IEV1.Vpointer + sz2 <= c1.IEV1.Vpointer) printf("t5\n");
}

        // make sure CFpsw is reliably set
        if (w1 & w2 & F &&              // if both instructions write to flags
            w1 != F &&
            w2 != F &&
            !((r1 | r2) & F) &&         // but neither instruction reads them
            !((c1.Iflags | c2.Iflags) & CFpsw))       // and we don't care about flags
        {
            w1 &= ~F;
            w2 &= ~F;                   // remove conflict
            goto L1;                    // and try again
        }

        // If other than the memory reference is a conflict
        if (w1 & r2 & ~mMEM || (r1 | w1) & w2 & ~mMEM)
        {   if (i) printf("\t1\n");
            if (i) printf("r1=%x, w1=%x, a1=%x, sz1=%d, r2=%x, w2=%x, a2=%x, sz2=%d\n",r1,w1,a1,sz1,r2,w2,a2,sz2);
            goto Lconflict;
        }

        // If referring to distinct types, then no dependency
        if (c1.Irex && c2.Irex && c1.Irex != c2.Irex)
            goto Lswap;

        ifl1 = c1.IFL1;
        ifl2 = c2.IFL1;

        // Special case: Allow indexed references using registers other than
        // ESP and EBP to be swapped with PUSH instructions.
        // Here c1 is the PUSH and c2 is the indexed reference.
        if (((c1.Iop & ~7) == 0x50 ||          // PUSH reg
             c1.Iop == 0x6A ||                 // PUSH imm8
             c1.Iop == 0x68 ||                 // PUSH imm16/imm32
             (c1.Iop == 0xFF && ci1.reg == 6) // PUSH EA
            ) &&
            ci2.flags & CIFL.ea && !(a2 & mSP) &&
            !(a2 & mBP && cast(int)c2.IEV1.Vpointer < 0)
           )
        {
            if (c1.Iop == 0xFF)
            {
                // PUSH EA also reads memory, so c2 must not write memory
                if (!(w2 & mMEM))
                    goto Lswap;
            }
            else
                goto Lswap;
        }

        // The mirror image of the special case above:
        // c2 is the PUSH and c1 is the indexed reference, so it is c1's
        // address registers and c1's offset that have to be examined.
        if (((c2.Iop & ~7) == 0x50 ||          // PUSH reg
             c2.Iop == 0x6A ||                 // PUSH imm8
             c2.Iop == 0x68 ||                 // PUSH imm16/imm32
             (c2.Iop == 0xFF && ci2.reg == 6) // PUSH EA
            ) &&
            ci1.flags & CIFL.ea && !(a1 & mSP) &&
            !(a1 & mBP && cast(int)c1.IEV1.Vpointer < 0)
           )
        {
            if (c2.Iop == 0xFF)
            {
                // PUSH EA also reads memory, so c1 must not write memory
                if (!(w1 & mMEM))
                    goto Lswap;
            }
            else
                goto Lswap;
        }

        // If not both an EA addressing mode, conflict
        if (!(ci1.flags & ci2.flags & CIFL.ea))
        {   if (i) printf("\t2\n");
            goto Lconflict;
        }

        // Same addressing mode bytes: the references are distinct if the
        // fixup types differ or the [offset, offset + size) ranges are disjoint
        if (ci1.sibmodrm == ci2.sibmodrm)
        {   if (ifl1 != ifl2)
                goto Lswap;
            switch (ifl1)
            {
                case FLconst:
                    if (c1.IEV1.Vint != c2.IEV1.Vint &&
                        (c1.IEV1.Vint + sz1 <= c2.IEV1.Vint ||
                         c2.IEV1.Vint + sz2 <= c1.IEV1.Vint))
                        goto Lswap;
                    break;
                case FLdatseg:
                    if (c1.IEV1.Vseg != c2.IEV1.Vseg ||
                        c1.IEV1.Vint + sz1 <= c2.IEV1.Vint ||
                        c2.IEV1.Vint + sz2 <= c1.IEV1.Vint)
                        goto Lswap;
                    break;

                default:
                    break;
            }
        }

        if ((c1.Iflags | c2.Iflags) & CFunambig &&
            (ifl1 != ifl2 ||
             ci1.sibmodrm != ci2.sibmodrm ||
             (c1.IEV1.Vint != c2.IEV1.Vint &&
              (c1.IEV1.Vint + sz1 <= c2.IEV1.Vint ||
               c2.IEV1.Vint + sz2 <= c1.IEV1.Vint)
             )
            )
           )
        {
            // Assume that [EBP] and [ESP] can point to the same location
            if (((a1 | a2) & (mBP | mSP)) == (mBP | mSP))
                goto Lconflict;
            goto Lswap;
        }

        if (i) printf("\t3\n");
        goto Lconflict;
    }

Lswap:
    if (fpsched)
    {
        //printf("\tfpsched %d,%d:\n",ci1.fp_op,ci2.fp_op);
        ubyte x1 = ci1.fxch_pre;
        ubyte y1 = ci1.fxch_post;
        ubyte x2 = ci2.fxch_pre;
        ubyte y2 = ci2.fxch_post;

        // Combine the two fp_op values into a single switch key
        static uint X(uint a, uint b) { return (a << 8) | b; }
        switch (X(ci1.fp_op,ci2.fp_op))
        {
            case X(FP.fstp, FP.fld):
                if (x1 || y1)
                    goto Lconflict;
                if (x2)
                    goto Lconflict;
                if (y2 == 0)
                    ci2.fxch_post++;
                else if (y2 == 1)
                {
                    ci2.fxch_pre++;
                    ci2.fxch_post++;
                }
                else
                {
                    goto Lconflict;
                }
                break;

            case X(FP.fstp, FP.fop):
                if (x1 || y1)
                    goto Lconflict;
                ci2.fxch_pre++;
                ci2.fxch_post++;
                break;

            case X(FP.fop, FP.fop):
                if (x1 == 0 && y1 == 1 && x2 == 0 && y2 == 0)
                {   ci2.fxch_pre = 1;
                    ci2.fxch_post = 1;
                    break;
                }
                if (x1 == 0 && y1 == 0 && x2 == 1 && y2 == 1)
                    break;
                goto Lconflict;

            case X(FP.fop, FP.fld):
                if (x1 || y1)
                    goto Lconflict;
                if (x2)
                    goto Lconflict;
                if (y2)
                    break;
                else if (fpsched == 2)
                    ci1.fxch_post = 1;
                ci2.fxch_post = 1;
                break;

            default:
                goto Lconflict;
        }

        //printf("\tpre = %d, post = %d\n",ci2.fxch_pre,ci2.fxch_post);
    }

    //printf("w1 = x%x, w2 = x%x\n",w1,w2);
    if (i) printf("no conflict\n\n");
    return 0;

Lconflict:
    //printf("r1=%x, w1=%x, r2=%x, w2=%x\n",r1,w1,r2,w2);
    delay_clocks = 0;

    // Determine if AGI
    if (!PRO && pair_agi(*ci1, *ci2))
        delay_clocks = 1;

    // Special delays for floating point
    if (fpsched)
    {   if (ci1.fp_op == FP.fld && ci2.fp_op == FP.fstp)
            delay_clocks = 1;
        else if (ci1.fp_op == FP.fop && ci2.fp_op == FP.fstp)
            delay_clocks = 3;
        else if (ci1.fp_op == FP.fop && ci2.fp_op == FP.fop)
            delay_clocks = 2;
    }
    else if (PRO)
    {
        // Look for partial register write stalls
        if (w1 & r2 & ALLREGS && sz1 < sz2)
            delay_clocks = 7;
    }
    else if ((w1 | r1) & (w2 | r2) & (C | S))
    {
        int op = c1.Iop;
        if (ci1.fp_op == FP.fld ||
            (op == 0xD9 && (c1.Irm & 0xF8) == 0xC0)
           )
        { }                             // FLD
        else if (op == 0xD9 && (c1.Irm & 0xF8) == 0xC8)
        { }                             // FXCH
        else if (c2.Iop == 0xD9 && (c2.Irm & 0xF8) == 0xC8)
        { }                             // FXCH
        else
            delay_clocks = 3;
    }

    if (i) printf("conflict %d\n\n",delay_clocks);
    return 0x100 + delay_clocks;
}
2227 
/// Number of slots in the scheduling table (Schedule.tbl), and also the
/// capacity of Schedule.cinfo.
enum TBLMAX = 2*3*20;        // must be divisible by both 2 and 3
                             // (U,V pipe in Pentium, 3 decode units
                             //  in Pentium Pro)
2231 
2232 struct Schedule
2233 {
2234 nothrow:
2235     Cinfo*[TBLMAX] tbl;         // even numbers are U pipe, odd numbers are V
2236     int tblmax;                 // max number of slots used
2237 
2238     Cinfo[TBLMAX] cinfo;
2239     int cinfomax;
2240 
2241     Barray!(Cinfo*) stagelist;  // list of instructions in staging area
2242 
2243     int fpustackused;           // number of slots in FPU stack that are used
2244 
    /**
     * Reset the scheduler to an empty state.
     *
     * Every byte of the struct is zeroed with memset, then the FPU stack
     * depth is seeded from the argument.
     * NOTE(review): the memset also zeroes `stagelist` without calling its
     * dtor() - confirm it owns no allocation when this is called repeatedly.
     * Params:
     *      fpustackinit = initial value for fpustackused
     */
    @trusted
    void initialize(int fpustackinit)          // initialize scheduler
    {
        //printf("Schedule::initialize(fpustackinit = %d)\n", fpustackinit);
        memset(&this, 0, Schedule.sizeof);
        fpustackused = fpustackinit;
    }
2252 
    /**
     * Tear down the scheduler by calling dtor() on the staging list.
     */
    @trusted
    void dtor()
    {
        stagelist.dtor();
    }
2258 
/******************************
 * Reassemble the scheduled instructions into a linked list.
 *
 * Staged instructions are first inserted into the table for as long as
 * insert() succeeds. The table is then emitted slot by slot, with FXCH
 * instructions generated for any pending fxch_pre/fxch_post counts.
 * Whatever is still left in the staging area is appended afterwards,
 * and the staging area is emptied.
 * Params:
 *      pc = where to store the first instruction; *pc must be null
 * Returns:
 *      address of the null `next` link at the end of the emitted list
 */
@trusted
code **assemble(code **pc)  // reassemble scheduled instructions
{
    debug
    {
        if (debugs) printf("assemble:\n");
    }

    assert(!*pc);

    // Try to insert the rest of the staged instructions;
    // sli is left at the first entry that could not be inserted
    size_t sli = 0;
    while (sli < stagelist.length)
    {
        Cinfo* staged = stagelist[sli];
        if (staged && !insert(staged))
            break;
        ++sli;
    }

    // Get the instructions out of the schedule table
    assert(cast(uint)tblmax <= TBLMAX);
    for (int i = 0; i < tblmax; i++)
    {
        Cinfo* slot = tbl[i];

        debug
        {
            if (debugs)
            {
                if (PRO)
                {
                    // One column per decode unit
                    immutable char[4][3] unitNames = [ "0  "," 1 ","  2" ];
                    const unit = i % 3;

                    if (slot)
                        printf("%s %d ",unitNames[unit].ptr,slot.uops);
                    else
                        printf("%s   ",unitNames[unit].ptr);
                }
                else
                {
                    printf((i & 1) ? " V " : "U  ");
                }

                if (slot)
                    slot.c.print();
                else
                    printf("\n");
            }
        }

        if (!slot)
            continue;

        fpustackused += slot.fpuadjust;
        //printf("stage()1: fpustackused = %d\n", fpustackused);

        code* ins = slot.c;
        if (i == 0)
            ins.Iflags |= CFtarg;      // by definition, first is always a jump target
        else
            ins.Iflags &= ~CFtarg;     // the rest are not

        // Put in any FXCH prefix
        if (slot.fxch_pre)
        {
            assert(i);
            code* pre = gen2(null,0xD9,0xC8 + slot.fxch_pre);
            *pc = pre;
            pc = &pre.next;
        }

        // Link in the instruction and advance pc to the end of its chain
        *pc = ins;
        do
        {
            assert(*pc != code_next(*pc));
            pc = &(*pc).next;
        } while (*pc);

        // Put in any FXCH postfix
        if (slot.fxch_post)
        {
            // If the next occupied slot has an identical FXCH prefix,
            // the two cancel each other out and neither is emitted
            bool cancelled = false;
            for (int j = i + 1; j < tblmax; j++)
            {
                Cinfo* following = tbl[j];
                if (!following)
                    continue;
                if (following.fxch_pre == slot.fxch_post)
                {
                    following.fxch_pre = 0;
                    cancelled = true;
                }
                break;                  // only the next occupied slot matters
            }

            if (!cancelled)
            {
                code* post = gen2(null,0xD9,0xC8 + slot.fxch_post);
                *pc = post;
                pc = &post.next;
            }
        }
    }

    // Just append any instructions left in the staging area
    for (size_t k = sli; k < stagelist.length; ++k)
    {
        Cinfo* rest = stagelist[k];
        if (!rest)
            continue;

        debug
        {
            if (debugs) { printf("appending: "); rest.c.print(); }
        }

        *pc = rest.c;
        do
        {
            pc = &(*pc).next;
        } while (*pc);

        fpustackused += rest.fpuadjust;
        //printf("stage()2: fpustackused = %d\n", fpustackused);
    }
    stagelist.setLength(0);

    return pc;
}
2377 
/******************************
 * Insert ci into scheduling table.
 *
 * The table is scanned backwards from tblmax for the most recent
 * instruction that conflicts with ci; this yields the first slot ci may
 * occupy (imin). The table is then scanned forwards from imin for a free
 * slot that satisfies the pairing rules (Pentium) or the decode rules
 * (Pentium Pro).
 *
 * A `MOV reg2,offset[ESP]` is allowed to move ahead of a `PUSH reg1` or a
 * `SUB ESP,offset`; its offset is adjusted by the skipped instructions'
 * spadjust values. If the insert fails, the original offset is restored.
 * Params:
 *      ci = instruction to insert
 * Returns:
 *      0       could not be scheduled; have to start a new one
 *      1       ci was placed in the table
 */

int insert(Cinfo *ci)
{   code *c;
    int clocks;
    int i;
    int ic = 0;                 // slot where the conflict occurred, -1 if none
    int imin;                   // first slot ci is allowed to go in
    targ_size_t offset;         // assigned below but never read in this function
    targ_size_t vpointer;       // saved c.IEV1.Vpointer, restored at Lnoinsert
    int movesp = 0;             // set to 1 if ci is a MOV reg2,offset[ESP]
    int reg2 = -1;              // avoid "may be uninitialized" warning

    //printf("insert "); ci.c.print();
    //printf("insert() %d\n", fpustackused);
    c = ci.c;
    //printf("\tc.Iop %x\n",c.Iop);
    vpointer = c.IEV1.Vpointer;
    assert(cast(uint)tblmax <= TBLMAX);
    if (tblmax == TBLMAX)               // if out of space
        goto Lnoinsert;
    if (tblmax == 0)                    // if table is empty
    {   // Just stuff it in the first slot
        i = tblmax;
        goto Linsert;
    }
    else if (c.Iflags & (CFtarg | CFtarg2))
        // Jump targets can only be first in the scheduler
        goto Lnoinsert;

    // Special case of:
    //  PUSH reg1
    //  MOV  reg2,x[ESP]
    if (c.Iop == 0x8B &&
        (c.Irm & modregrm(3,0,7)) == modregrm(1,0,4) &&
        c.Isib == modregrm(0,4,SP) &&
        c.IFL1 == FLconst &&
        (cast(byte)c.IEV1.Vpointer) >= REGSIZE
       )
    {
        movesp = 1;                     // this is a MOV reg2,offset[ESP]
        offset = cast(byte)c.IEV1.Vpointer;
        reg2 = (c.Irm >> 3) & 7;
    }


    // Start at tblmax, and back up until we get a conflict
    ic = -1;
    imin = 0;
    for (i = tblmax; i >= 0; i--)
    {
        Cinfo* cit = tbl[i];
        if (!cit)
            continue;

        // Look for special case swap
        if (movesp &&
            (cit.c.Iop & ~7) == 0x50 &&               // if PUSH reg1
            (cit.c.Iop & 7) != reg2 &&                // if reg1 != reg2
            (cast(byte)c.IEV1.Vpointer) >= -cit.spadjust
           )
        {
            // Moving the MOV ahead of the PUSH: compensate the ESP offset
            c.IEV1.Vpointer += cit.spadjust;
            //printf("\t1, spadjust = %d, ptr = x%x\n",cit.spadjust,c.IEV1.Vpointer);
            continue;
        }

        if (movesp &&
            cit.c.Iop == 0x83 &&
            cit.c.Irm == modregrm(3,5,SP) &&          // if SUB ESP,offset
            cit.c.IFL2 == FLconst &&
            (cast(byte)c.IEV1.Vpointer) >= -cit.spadjust
           )
        {
            //printf("\t2, spadjust = %d\n",cit.spadjust);
            // Moving the MOV ahead of the SUB ESP: compensate the ESP offset
            c.IEV1.Vpointer += cit.spadjust;
            continue;
        }

        // fpsched == 1: conflict() may adjust ci's FXCH counts so that
        // the two floating point instructions can be swapped
        clocks = conflict(cit,ci,1);
        if (clocks)
        {   int j;

            ic = i;                     // where the conflict occurred
            clocks &= 0xFF;             // convert to delay count

            // Move forward the delay clocks
            if (clocks == 0)
                j = i + 1;
            else if (PRO)
                // start of the next group of 3 slots, plus one group per clock
                j = (((i + 3) / 3) * 3) + clocks * 3;
            else
            {   // start of the next pair of slots, plus one pair per clock
                j = ((i + 2) & ~1) + clocks * 2;

                // It's possible we skipped over some AGI generating
                // instructions due to movesp.
                int k;
                for (k = i + 1; k < j; k++)
                {
                    if (k >= TBLMAX)
                        goto Lnoinsert;
                    if (tbl[k] && pair_agi(*tbl[k], *ci))
                    {
                        // jump to the odd slot of the next pair;
                        // the loop's k++ then moves on past it
                        k = ((k + 2) & ~1) + 1;
                    }
                }
                j = k;
            }

            if (j >= TBLMAX)                    // exceed table size?
                goto Lnoinsert;
            imin = j;                           // first possible slot c can go in
            break;
        }
    }


    // Scan forward looking for a hole to put it in
    for (i = imin; i < TBLMAX; i++)
    {
        if (tbl[i])
        {
            // In case, due to movesp, we skipped over some AGI instructions
            if (!PRO && pair_agi(*tbl[i], *ci))
            {
                i = ((i + 2) & ~1) + 1;
                if (i >= TBLMAX)
                    goto Lnoinsert;
            }
        }
        else
        {
            if (PRO)
            {   int i0 = (i / 3) * 3;           // index of decode unit 0
                Cinfo *ci0;

                assert(((TBLMAX / 3) * 3) == TBLMAX);
                switch (i - i0)
                {
                    case 0:                     // i0 can handle any instruction
                        goto Linsert;
                    case 1:
                        ci0 = tbl[i0];
                        if (ci.uops > 1)
                        {
                            // A multi-uop instruction has to go in unit 0.
                            // If the single-uop instruction there may be
                            // moved, shift it into this slot (L1 is in the
                            // Pentium branch below) and take its place.
                            if (i0 >= imin && ci0.uops == 1)
                                goto L1;
                            i++;                // also skip unit 2 of this group
                            break;
                        }
                        if (triple_test(ci0,ci,tbl[i0 + 2]))
                            goto Linsert;
                        break;
                    case 2:
                        ci0 = tbl[i0];
                        if (ci.uops > 1)
                        {
                            if (i0 >= imin && ci0.uops == 1)
                            {
                                if (i >= tblmax)
                                {   if (i + 1 >= TBLMAX)
                                        goto Lnoinsert;
                                    tblmax = i + 1;
                                }
                                // Shift units 0 and 1 up by one slot
                                // and put ci in unit 0
                                tbl[i0 + 2] = tbl[i0 + 1];
                                tbl[i0 + 1] = ci0;
                                i = i0;
                                goto Linsert;
                            }
                            break;
                        }
                        if (triple_test(ci0,tbl[i0 + 1],ci))
                            goto Linsert;
                        break;
                    default:
                        assert(0);
                }
            }
            else
            {
                assert((TBLMAX & 1) == 0);
                if (i & 1)                      // if V pipe
                {
                    if (pair_test(*tbl[i - 1], *ci))
                    {
                        goto Linsert;
                    }
                    else if (i > imin && pair_test(*ci, *tbl[i - 1]))
                    {
                        // They pair the other way around: move the U pipe
                        // instruction to this slot and put ci in its place
                L1:
                        tbl[i] = tbl[i - 1];
                        if (i >= tblmax)
                            tblmax = i + 1;
                        i--;
                        //printf("\tswapping with x%02x\n",tbl[i + 1].c.Iop);
                        goto Linsert;
                    }
                }
                else                    // will always fit in U pipe
                {
                    assert(!tbl[i + 1]);        // because V pipe should be empty
                    goto Linsert;
                }
            }
        }
    }

Lnoinsert:
    //printf("\tnoinsert\n");
    c.IEV1.Vpointer = vpointer;  // reset to original value
    return 0;

Linsert:
    // Insert at location i
    assert(i < TBLMAX);
    assert(tblmax <= TBLMAX);
    tbl[i] = ci;
    //printf("\tinsert at location %d\n",i);

    // If it's a scheduled floating point code, we have to adjust
    // the FXCH values
    if (ci.fp_op)
    {
        ci.fxch_pre = 0;
        ci.fxch_post = 0;                      // start over again

        // Running FPU stack depth over the occupied slots below tblmax
        int fpu = fpustackused;
        for (int j = 0; j < tblmax; j++)
        {
            if (tbl[j])
            {
                fpu += tbl[j].fpuadjust;
                if (fpu >= 8)                   // if FPU stack overflow
                {   tbl[i] = null;              // take ci back out of the table
                    //printf("fpu stack overflow\n");
                    goto Lnoinsert;
                }
            }
        }

        // Recompute the FXCH counts against every instruction after slot i
        // (fpsched == 2: conflict() may adjust both instructions)
        for (int j = tblmax; j > i; j--)
        {
            if (j < TBLMAX && tbl[j])
                conflict(tbl[j],ci,2);
        }
    }

    if (movesp)
    {   // Adjust [ESP] offsets

        // Take back the adjustment for each instruction between the
        // conflict slot and the insertion slot
        //printf("\tic = %d, inserting at %d\n",ic,i);
        assert(cast(uint)tblmax <= TBLMAX);
        for (int j = ic + 1; j < i; j++)
        {
            Cinfo* cit = tbl[j];
            if (cit)
            {
                c.IEV1.Vpointer -= cit.spadjust;
                //printf("\t3, spadjust = %d, ptr = x%x\n",cit.spadjust,c.IEV1.Vpointer);
            }
        }
    }
    if (i >= tblmax)
        tblmax = i + 1;

    // Now do a hack. Look back at immediately preceding instructions,
    // and see if we can swap with a push.
    // (Disabled: the `0 &&` makes this block unreachable.)
    if (0 && movesp)
    {
        while (1)
        {
            int j;
            for (j = 1; i > j; j++)
                if (tbl[i - j])
                    break;

            if (i >= j && tbl[i - j] &&
                   (tbl[i - j].c.Iop & ~7) == 0x50 &&       // if PUSH reg1
                   (tbl[i - j].c.Iop & 7) != reg2 &&  // if reg1 != reg2
                   cast(byte)c.IEV1.Vpointer >= REGSIZE)
            {
                //printf("\t-4 prec, i-j=%d, i=%d\n",i-j,i);
                assert(cast(uint)i < TBLMAX);
                assert(cast(uint)(i - j) < TBLMAX);
                tbl[i] = tbl[i - j];
                tbl[i - j] = ci;
                i -= j;
                c.IEV1.Vpointer -= REGSIZE;
            }
            else
                break;
        }
    }

    //printf("\tinsert\n");
    return 1;
}
2679 
/******************************
 * Insert c into staging area.
 *
 * Staged instructions that conflict with c are first moved into the
 * scheduling table. Jump targets and instructions marked CFvolatile or
 * CFvex are never staged: everything staged so far is inserted, then c.
 * Params:
 *      c = instruction to stage
 * Returns:
 *      false if could not be scheduled; have to start a new one
 */

@trusted
bool stage(code *c)
{
    //printf("stage: "); c.print();
    if (cinfomax == TBLMAX)             // if out of space
        return false;

    Cinfo* ci = &cinfo[cinfomax++];
    getinfo(*ci,c);

    if (c.Iflags & (CFtarg | CFtarg2 | CFvolatile | CFvex))
    {
        // Insert anything in stagelist, then the instruction itself
        foreach (ref pending; stagelist[])
        {
            if (!pending)
                continue;
            if (!insert(pending))
                return false;
            pending = null;
        }
        return insert(ci) != 0;
    }

    // Look through stagelist, and insert any AGI conflicting instructions
    bool agi = false;
    foreach (ref pending; stagelist[])
    {
        if (!pending || !pair_agi(*pending, *ci))
            continue;
        if (!insert(pending))
            return false;
        pending = null;
        agi = true;                     // we put out an AGI
    }

    // Look through stagelist, and insert any other conflicting instructions
    foreach (idx, ref pending; stagelist[])
    {
        if (!pending)
            continue;

        // No conflict, or both are pushes: leave it staged
        if (!conflict(pending,ci,0) ||
            (pending.flags & ci.flags & CIFL.push))
            continue;

        if (pending.spadjust)
        {
            // We need to insert all previous adjustments to ESP
            foreach (ref earlier; stagelist[0 .. idx])
            {
                if (!earlier || !earlier.spadjust)
                    continue;
                if (!insert(earlier))
                    return false;
                earlier = null;
            }
        }

        if (!insert(pending))
            return false;
        pending = null;
    }

    // Instructions flagged nostage are not staged but sent right out,
    // unless an AGI was put out above
    if (!agi && ci.flags & CIFL.nostage)
        return insert(ci) != 0;

    stagelist.push(ci);         // append to staging list
    return true;
}
2770 
2771 }
2772 
2773 
2774 
/********************************************
 * Snip off tail of instruction sequence.
 *
 * The instructions kept with c are the NOPs, line number escapes, and
 * instructions sharing c's CFclassinit flag that follow it directly,
 * up to but not including the first jump target. The list is cut after
 * the last of them.
 * Returns:
 *      next instruction (the tail) or
 *      null for no more instructions
 */

private code * csnip(code *c)
{
    if (!c)
        return c;

    const uint classinit = c.Iflags & CFclassinit;

    // link always addresses the `next` field that points at tail
    code** link = &c.next;
    code* tail = *link;
    while (tail &&
           !(tail.Iflags & (CFtarg | CFtarg2)) &&
           (tail.Iop == NOP ||
            tail.Iop == (ESCAPE | ESClinnum) ||
            tail.Iflags & classinit))
    {
        link = &tail.next;
        tail = *link;
    }

    *link = null;               // detach the tail
    return tail;
}
2805 
2806 
/******************************
 * Schedule Pentium instructions,
 * based on Steve Russell's algorithm.
 *
 * NOPs, ESCAPE pseudo-ops (other than ESCadjfpu) and CFclassinit
 * instructions that are not jump targets are passed straight through to
 * the result. Everything else is staged into the scheduling table until
 * stage() refuses an instruction or the list ends, and the table is then
 * assembled onto the end of the result.
 *
 * Params:
 *      c       = list of instructions to schedule
 *      scratch = not referenced by this function
 * Returns:
 *      head of the rebuilt instruction list
 */

@trusted
private code *schedule(code *c,regm_t scratch)
{
    code* head = null;
    code** tail = &head;        // where the next piece of output gets linked
    Schedule sch = void;

    sch.initialize(0);                  // initialize scheduling table
    for (; c; )
    {
        const bool isTarget = (c.Iflags & (CFtarg | CFtarg2)) != 0;
        const bool isEscape = (c.Iop & ESCAPEmask) == ESCAPE &&
                              c.Iop != (ESCAPE | ESCadjfpu);
        const bool passThrough = !isTarget &&
                                 (c.Iop == NOP ||
                                  isEscape ||
                                  (c.Iflags & CFclassinit) != 0);

        if (passThrough)
        {
            // Move this instruction, unscheduled, to the end of the output
            *tail = c;
            code* following = code_next(c);
            c.next = null;
            tail = &c.next;
            c = following;
        }
        else
        {
            //printf("init\n");
            sch.initialize(sch.fpustackused);   // initialize scheduling table

            // c is known to be non-null on entry
            do
            {
                //printf("insert %p\n",c);
                if (!sch.stage(c))      // store c in scheduling table
                    break;
                c = csnip(c);
            } while (c);

            //printf("assem %d\n",sch.tblmax);
            tail = sch.assemble(tail);  // reassemble instruction stream
        }
    }
    sch.dtor();

    return head;
}
2855 
2856 /**************************************************************************/
2857 
/********************************************
 * Replace any occurrence of r1 in EA with r2.
 *
 * Only 32 bit memory addressing forms are rewritten. Register-direct
 * operands (mod == 3) are left alone, as is a bare disp32 operand,
 * which has no register to replace.
 *
 * Params:
 *      c  = instruction whose mod/reg/rm byte (and SIB byte, if it has
 *           one) is rewritten in place
 *      r1 = register number (0..7) to look for
 *      r2 = register number (0..7) to substitute for r1
 */

private void repEA(code *c,uint r1,uint r2)
{
    const uint rmn = c.Irm;
    const uint mod = rmn & 0xC0;
    const uint reg = rmn & modregrm(0,7,0);
    const uint rm  = rmn & 7;

    // Register operand: there is no address to rewrite, and Isib is not
    // part of the instruction, so leave it untouched.
    if (mod == 0xC0)
        return;

    if (!is32bitaddr(I32,c.Iflags))
        return;

    // If disp32 only, there is no register in the EA
    if ((rmn & modregrm(3,0,7)) == modregrm(0,0,5))
        return;

    if (rm == 4)
    {   // SIB byte addressing
        const uint sib = c.Isib;
        uint base = sib & 7;
        uint index = (sib >> 3) & 7;

        // With mod == 0, base 5 means "disp32, no base register", so it
        // must neither be matched as EBP nor be produced by substituting EBP.
        if (base == r1 &&
            !(mod == 0 && (r1 == BP || r2 == BP)))
            base = r2;

        // Index 4 means "no index register": it is not a use of ESP, and
        // ESP cannot be encoded as an index. Leaving r1 in place is always
        // valid, only the substitution is skipped.
        if (index == r1 && r1 != SP && r2 != SP)
            index = r2;

        c.Isib = cast(ubyte)((sib & 0xC0) | (index << 3) | base);
    }
    else if (rm == r1)
    {
        if (r2 == SP)
        {   // Replace [r1] with [ESP]; rm 4 requires a SIB byte
            // (base ESP, no index) regardless of what r1 was.
            c.Irm = cast(ubyte)(mod | reg | 4);
            c.Isib = modregrm(0,4,SP);
        }
        else if (r2 == BP && mod == 0)
        {   // [EBP] has no mod == 0 encoding; use [EBP + disp8] with disp 0
            c.Irm = cast(ubyte)(modregrm(1,0,0) | reg | r2);
            c.IFL1 = FLconst;
            c.IEV1.Vint = 0;
        }
        else
            c.Irm = cast(ubyte)(mod | reg | r2);
    }
}
2914 
2915 /******************************************
2916  * Instruction scheduler.
2917  * Input:
2918  *      c               list of instructions to schedule
2919  *      scratch         scratch registers we can use
2920  * Returns:
2921  *      revised list of scheduled instructions
2922  */
2923 
/******************************************
 * Swap c1 and c2.
 * c1 comes before c2.
 * The contents of the two nodes are exchanged in place, so the node
 * addresses (which jmp targets refer to) are not disturbed. Each node
 * keeps its own CFtarg/CFtarg2 flags and its own next link.
 *
 * Params:
 *      c1 = first instruction
 *      c2 = second instruction
 */

private void code_swap(code *c1,code *c2)
{
    const uint targFlags = CFtarg | CFtarg2;

    //printf("code_swap(%x, %x)\n",c1,c2);

    // Special case of:
    //  PUSH reg1
    //  MOV  reg2,x[ESP]
    // Once the MOV runs ahead of the PUSH its offset shrinks by one register.
    if ((c1.Iop & ~7) == 0x50 && c2.Iop == 0x8B)
    {
        const bool espDisp8 = (c2.Irm & modregrm(3,0,7)) == modregrm(1,0,4) &&
                              c2.Isib == modregrm(0,4,SP) &&
                              c2.IFL1 == FLconst;
        const bool otherReg = (c1.Iop & 7) != ((c2.Irm >> 3) & 7);

        if (espDisp8 &&
            (cast(byte)c2.IEV1.Vpointer) >= REGSIZE &&
            otherReg)
        {
            c2.IEV1.Vpointer -= REGSIZE;
        }
    }

    // Remember what has to stay with each node
    const uint targ1 = c1.Iflags & targFlags;
    const uint targ2 = c2.Iflags & targFlags;
    code* next1 = c1.next;
    code* next2 = c2.next;

    // Exchange the instruction contents
    code tmp = *c1;
    *c1 = *c2;
    *c2 = tmp;

    // Retain original CFtarg
    c1.Iflags = (c1.Iflags & ~targFlags) | targ1;
    c2.Iflags = (c2.Iflags & ~targFlags) | targ2;

    // Retain original list order
    c1.next = next1;
    c2.next = next2;
}
2958 
/******************************************
 * Peephole optimizations on pairs of adjacent instructions (c, c1),
 * where c1 is what cnext(c) returns. Pairs whose second instruction is a
 * jump target (CFtarg, CFtarg2) are left alone.
 *
 * Params:
 *      cstart  = list of instructions, modified in place
 *      scratch = not referenced by this function
 * Returns:
 *      cstart
 */
private code *peephole(code *cstart,regm_t scratch)
{
    // Look for cases of:
    //  MOV r1,r2
    //  OP ?,r1
    // we can replace with:
    //  MOV r1,r2
    //  OP ?,r2
    // to improve pairing
    code *c1;
    uint r1,r2;
    uint mod,reg,rm;

    //printf("peephole\n");
    for (code *c = cstart; c; c = c1)
    {
        ubyte rmn;

        //c.print();
        c1 = cnext(c);
    Ln:
        if (!c1)
            break;
        if (c1.Iflags & (CFtarg | CFtarg2))
            continue;

        // Do:
        //      PUSH    reg
        if (I32 && (c.Iop & ~7) == 0x50)
        {
            uint regx = c.Iop & 7;

            //  MOV     regx,[ESP]      =>      NOP
            if (c1.Iop == 0x8B &&
                c1.Irm == modregrm(0,regx,4) &&
                c1.Isib == modregrm(0,4,SP))
            {   c1.Iop = NOP;
                continue;
            }

            //  PUSH    [ESP]           =>      PUSH    regx
            if (c1.Iop == 0xFF &&
                c1.Irm == modregrm(0,6,4) &&
                c1.Isib == modregrm(0,4,SP))
            {   c1.Iop = 0x50 + regx;
                continue;
            }

            //  CMP     [ESP],imm       =>      CMP     regx,imm
            if (c1.Iop == 0x83 &&
                c1.Irm == modregrm(0,7,4) &&
                c1.Isib == modregrm(0,4,SP))
            {   c1.Irm = modregrm(3,7,regx);
                if (c1.IFL2 == FLconst && cast(byte)c1.IEV2.Vuns == 0)
                {   // to TEST regx,regx
                    c1.Iop = (c1.Iop & 1) | 0x84;
                    c1.Irm = modregrm(3,regx,regx);
                }
                continue;
            }

        }

        // Do:
        //      MOV     reg,[ESP]       =>      POP     reg
        //      ADD     ESP,4           =>      NOP
        // For c the reg field is the destination register, so it is masked
        // out of the comparison. For c1 (opcode 0x83) the reg field selects
        // the operation, so the whole byte must match: only /0 (ADD) may be
        // folded into the POP, not SUB/AND/CMP/... ESP,4.
        if (I32 && c.Iop == 0x8B && (c.Irm & 0xC7) == modregrm(0,0,4) &&
            c.Isib == modregrm(0,4,SP) &&
            c1.Iop == 0x83 && c1.Irm == modregrm(3,0,SP) &&
            !(c1.Iflags & CFpsw) && c1.IFL2 == FLconst && c1.IEV2.Vint == 4)
        {
            uint regx = (c.Irm >> 3) & 7;
            c.Iop = 0x58 + regx;
            c1.Iop = NOP;
            continue;
        }

        // Combine two ADD/SUB imm8 of the same register
        if (c.Iop == c1.Iop &&
            c.Iop == 0x83 &&
            (c.Irm & 0xC0) == 0xC0 &&
            (c.Irm & modregrm(3,0,7)) == (c1.Irm & modregrm(3,0,7)) &&
            !(c1.Iflags & CFpsw) &&
            c.IFL2 == FLconst && c1.IFL2 == FLconst
           )
        {   int i = cast(byte)c.IEV2.Vint;
            int i1 = cast(byte)c1.IEV2.Vint;
            // Key: operation of c in bits 3..5, operation of c1 in bits 0..2
            switch ((c.Irm & modregrm(0,7,0)) | ((c1.Irm & modregrm(0,7,0)) >> 3))
            {
                case (0 << 3) | 0:              // ADD, ADD
                case (5 << 3) | 5:              // SUB, SUB
                    i += i1;
                    goto Laa;
                case (0 << 3) | 5:              // ADD, SUB
                case (5 << 3) | 0:              // SUB, ADD
                    i -= i1;
                    goto Laa;
                Laa:
                    if (cast(byte)i != i)
                        c.Iop &= ~2;            // no longer fits imm8: 0x83 => 0x81
                    c.IEV2.Vint = i;
                    c1.Iop = NOP;
                    if (i == 0)
                        c.Iop = NOP;
                    continue;

                default:
                    break;
            }
        }

        if (c.Iop == 0x8B && (c.Irm & 0xC0) == 0xC0)    // MOV r1,r2
        {   r1 = (c.Irm >> 3) & 7;
            r2 = c.Irm & 7;
        }
        else if (c.Iop == 0x89 && (c.Irm & 0xC0) == 0xC0)   // MOV r1,r2
        {   r1 = c.Irm & 7;
            r2 = (c.Irm >> 3) & 7;
        }
        else
        {
            continue;
        }

        // Rewrite the address first, then read Irm, so that the cases
        // below build on repEA's result instead of writing back a stale
        // copy of the byte.
        if (c1.hasModregrm())
            repEA(c1,r1,r2);
        rmn = c1.Irm;
        mod = rmn & 0xC0;
        reg = rmn & modregrm(0,7,0);
        rm =  rmn & 7;
        switch (c1.Iop)
        {
            case 0x50:
            case 0x51:
            case 0x52:
            case 0x53:
            case 0x54:
            case 0x55:
            case 0x56:
            case 0x57:                          // PUSH reg
                if ((c1.Iop & 7) == r1)
                {   c1.Iop = 0x50 | r2;
                    //printf("schedule PUSH reg\n");
                }
                break;

            case 0x81:
            case 0x83:
                // Look for CMP EA,imm
                if (reg == modregrm(0,7,0))
                {
                    if (mod == 0xC0 && rm == r1)
                        c1.Irm = cast(ubyte)(mod | reg | r2);
                }
                break;

            case 0x84:                  // TEST reg,byte ptr EA
                if (r1 >= 4 || r2 >= 4) // if not a byte register
                    break;
                // Only the low 2 bits are compared and replaced, so bit 2
                // (which selects the high byte register) is carried over.
                if ((rmn & 0xC0) == 0xC0)
                {
                    if ((rmn & 3) == r1)
                    {   c1.Irm = rmn = cast(ubyte)((rmn & modregrm(3,7,4)) | r2);
                        //printf("schedule 1\n");
                    }
                }
                if ((rmn & modregrm(0,3,0)) == modregrm(0,r1,0))
                {   c1.Irm = (rmn & modregrm(3,4,7)) | modregrm(0,r2,0);
                    //printf("schedule 2\n");
                }
                break;
            case 0x85:                  // TEST reg,word ptr EA
                if ((rmn & 0xC0) == 0xC0)
                {
                    if ((rmn & 7) == r1)
                    {   c1.Irm = rmn = cast(ubyte)((rmn & modregrm(3,7,0)) | r2);
                        //printf("schedule 3\n");
                    }
                }
                if ((rmn & modregrm(0,7,0)) == modregrm(0,r1,0))
                {   c1.Irm = (rmn & modregrm(3,0,7)) | modregrm(0,r2,0);
                    //printf("schedule 4\n");
                }
                break;

            case 0x89:                  // MOV EA,reg
                if ((rmn & modregrm(0,7,0)) == modregrm(0,r1,0))
                {   c1.Irm = (rmn & modregrm(3,0,7)) | modregrm(0,r2,0);
                    //printf("schedule 5\n");
                    if (c1.Irm == modregrm(3,r2,r2))    // MOV r2,r2
                        goto Lnop;
                }
                break;

            case 0x8B:                  // MOV reg,EA
                if ((rmn & 0xC0) == 0xC0 &&
                    (rmn & 7) == r1)            // if EA == r1
                {   c1.Irm = cast(ubyte)((rmn & modregrm(3,7,0)) | r2);
                    //printf("schedule 6\n");
                    if (c1.Irm == modregrm(3,r2,r2))    // MOV r2,r2
                        goto Lnop;
                }
                break;

            case 0x3C:                  // CMP AL,imm8
                if (r1 == AX && r2 < 4)
                {   c1.Iop = 0x80;
                    c1.Irm = modregrm(3,7,r2);
                    //printf("schedule 7, r2 = %d\n", r2);
                }
                break;

            case 0x3D:                  // CMP AX,imm16
                if (r1 == AX)
                {   c1.Iop = 0x81;
                    c1.Irm = modregrm(3,7,r2);
                    if (c1.IFL2 == FLconst &&
                        c1.IEV2.Vuns == cast(byte)c1.IEV2.Vuns)
                        c1.Iop = 0x83;          // immediate fits in imm8
                    //printf("schedule 8\n");
                }
                break;

            default:
                break;
        }
        continue;
Lnop:
        // c1 became a no-op; retry with the instruction after it,
        // still paired with the same c
        c1.Iop = NOP;
        c1 = cnext(c1);
        goto Ln;
    }
    return cstart;
}
3193 
3194 /*****************************************************************/
3195 
/**********************************************
 * Replace complex instructions with simple ones more conducive
 * to scheduling.
 *
 * Two forms are split, each into a load followed by a register operation:
 *      CMP mem,imm (opcode 0x83)  =>  MOV reg,mem ; CMP reg,imm (or TEST reg,reg)
 *      PUSH mem                   =>  MOV reg,mem ; PUSH reg
 * Instructions that are jump targets or carry CFopsize are skipped.
 * Nothing is done unless at least two scratch registers remain after
 * removing fregsaved.
 *
 * Params:
 *      c       = list of instructions, modified in place
 *      scratch = mask of scratch registers that may be used
 * Returns:
 *      c
 */

@trusted
code *simpleops(code *c,regm_t scratch)
{
    // Worry about using registers not saved yet by prolog
    scratch &= ~fregsaved;

    // if 0 or 1 registers
    if ((scratch & (scratch - 1)) == 0)
        return c;

    uint reg = findreg(scratch);

    for (code* ci = c; ci; ci = ci.next)
    {
        if (ci.Iflags & (CFtarg | CFtarg2 | CFopsize))
            continue;

        const uint subop = ci.Irm & modregrm(0,7,0);
        const bool memOperand = (ci.Irm & modregrm(3,0,0)) != modregrm(3,0,0);
        code* follow;           // instruction to insert after ci

        if (ci.Iop == 0x83 && subop == modregrm(0,7,0) && memOperand)
        {   // Replace CMP mem,imm with:
            //  MOV reg,mem
            //  CMP reg,imm

            //printf("replacing CMP\n");
            ci.Iop = 0x8B;
            ci.Irm = (ci.Irm & modregrm(3,0,7)) | modregrm(0,reg,0);

            follow = code_calloc();
            if (reg == AX)
                follow.Iop = 0x3D;
            else
            {   follow.Iop = 0x83;
                follow.Irm = modregrm(3,7,reg);
            }
            follow.IFL2 = ci.IFL2;
            follow.IEV2 = ci.IEV2;

            // See if follow should be replaced by a TEST
            targ_long imm = follow.IEV2.Vuns;
            if (!(follow.Iop & 1))
                imm &= 0xFF;
            else if (I32 ? ci.Iflags & CFopsize : !(ci.Iflags & CFopsize))
                imm = cast(short) imm;
            if (imm == 0)
            {
                follow.Iop = 0x85;                 // TEST reg,reg
                follow.Irm = modregrm(3,reg,reg);
            }
        }
        else if (ci.Iop == 0xFF && subop == modregrm(0,6,0) && memOperand)
        {   // Replace PUSH mem with:
            //  MOV reg,mem
            //  PUSH reg

            // printf("replacing PUSH\n");
            ci.Iop = 0x8B;
            ci.Irm = (ci.Irm & modregrm(3,0,7)) | modregrm(0,reg,0);

            follow = gen1(null,0x50 + reg);
        }
        else
            continue;

        // Link the new instruction in right after ci
        follow.next = ci.next;
        ci.next = follow;

        // Switch to another reg
        if (scratch & ~mask(reg))
            reg = findreg(scratch & ~mask(reg));
    }
    return c;
}