1 /**
2  * Code generation 3
3  *
4  * Includes:
 * - generating a function prolog (pushing return address, loading parameters)
6  * - generating a function epilog (restoring registers, returning)
7  * - generation / peephole optimizations of jump / branch instructions
8  *
9  * Compiler implementation of the
10  * $(LINK2 https://www.dlang.org, D programming language).
11  *
12  * Copyright:   Copyright (C) 1994-1998 by Symantec
13  *              Copyright (C) 2000-2023 by The D Language Foundation, All Rights Reserved
14  * Authors:     $(LINK2 https://www.digitalmars.com, Walter Bright)
15  * License:     $(LINK2 https://www.boost.org/LICENSE_1_0.txt, Boost License 1.0)
16  * Source:      $(LINK2 https://github.com/dlang/dmd/blob/master/src/dmd/backend/cod3.d, backend/cod3.d)
17  * Documentation:  https://dlang.org/phobos/dmd_backend_cod3.html
18  * Coverage:    https://codecov.io/gh/dlang/dmd/src/master/src/dmd/backend/cod3.d
19  */
20 
21 module dmd.backend.cod3;
22 
23 import core.bitop;
24 import core.stdc.stdio;
25 import core.stdc.stdlib;
26 import core.stdc.string;
27 
28 import dmd.backend.backend;
29 import dmd.backend.barray;
30 import dmd.backend.cc;
31 import dmd.backend.cdef;
32 import dmd.backend.cgcse;
33 import dmd.backend.code;
34 import dmd.backend.code_x86;
35 import dmd.backend.codebuilder;
36 import dmd.backend.dlist;
37 import dmd.backend.dvec;
38 import dmd.backend.melf;
39 import dmd.backend.mem;
40 import dmd.backend.el;
41 import dmd.backend.global;
42 import dmd.backend.obj;
43 import dmd.backend.oper;
44 import dmd.backend.rtlsym;
45 import dmd.backend.symtab;
46 import dmd.backend.ty;
47 import dmd.backend.type;
48 import dmd.backend.xmm;
49 
50 
51 nothrow:
52 @safe:
53 
// Compile-time switch; tested in outblkexitcode()'s BCgoto handling.
enum MARS = true;

//private void genorreg(ref CodeBuilder c, uint t, uint f) { genregs(c, 0x09, f, t); }

// Compile-time switch, currently disabled.
enum JMPJMPTABLE = false;               // benchmarking shows it's slower
59 
60 /*************
61  * Size in bytes of each instruction.
62  * 0 means illegal instruction.
63  * bit  M:      if there is a modregrm field (EV1 is reserved for modregrm)
64  * bit  T:      if there is a second operand (EV2)
65  * bit  E:      if second operand is only 8 bits
66  * bit  A:      a short version exists for the AX reg
67  * bit  R:      a short version exists for regs
68  * bits 2..0:   size of instruction (excluding optional bytes)
69  */
70 
// Flag bits OR'ed into the entries of the inssize[] and inssize2[] tables.
enum
{
    M = 0x80,   // instruction has a modregrm field
    T = 0x40,   // instruction has a second operand (EV2)
    E = 0x20,   // second operand is only 8 bits
    A = 0x10,   // a short version exists for the AX reg
    R = 0x08,   // a short version exists for regs
    W = 0,      // contributes no bits; NOTE(review): appears to be a readability
                // marker on the 0F 80..8F entries of inssize2[] - confirm intent
}
80 
// Indexed by the low opcode byte (see hasModregrm()). Each entry combines the
// M/T/E/A/R flag bits with the instruction size in the low bits.
// Entries A0..A3 start out as T|3 and are patched to T|5 by cod3_set32() and
// cod3_set64(), which is why the table is mutable.
private __gshared ubyte[256] inssize =
[       M|2,M|2,M|2,M|2,        T|E|2,T|3,1,1,          /* 00 */
        M|2,M|2,M|2,M|2,        T|E|2,T|3,1,1,          /* 08 */
        M|2,M|2,M|2,M|2,        T|E|2,T|3,1,1,          /* 10 */
        M|2,M|2,M|2,M|2,        T|E|2,T|3,1,1,          /* 18 */
        M|2,M|2,M|2,M|2,        T|E|2,T|3,1,1,          /* 20 */
        M|2,M|2,M|2,M|2,        T|E|2,T|3,1,1,          /* 28 */
        M|2,M|2,M|2,M|2,        T|E|2,T|3,1,1,          /* 30 */
        M|2,M|2,M|2,M|2,        T|E|2,T|3,1,1,          /* 38 */
        1,1,1,1,                1,1,1,1,                /* 40 */
        1,1,1,1,                1,1,1,1,                /* 48 */
        1,1,1,1,                1,1,1,1,                /* 50 */
        1,1,1,1,                1,1,1,1,                /* 58 */
        1,1,M|2,M|2,            1,1,1,1,                /* 60 */
        T|3,M|T|4,T|E|2,M|T|E|3, 1,1,1,1,               /* 68 */
        T|E|2,T|E|2,T|E|2,T|E|2, T|E|2,T|E|2,T|E|2,T|E|2,       /* 70 */
        T|E|2,T|E|2,T|E|2,T|E|2, T|E|2,T|E|2,T|E|2,T|E|2,       /* 78 */
        M|T|E|A|3,M|T|A|4,M|T|E|3,M|T|E|3,      M|2,M|2,M|2,M|A|R|2, /* 80 */
        M|A|2,M|A|2,M|A|2,M|A|2,        M|2,M|2,M|2,M|R|2,      /* 88 */
        1,1,1,1,                1,1,1,1,                /* 90 */
        1,1,T|5,1,              1,1,1,1,                /* 98 */

     // cod3_set32() patches this
    //  T|5,T|5,T|5,T|5,        1,1,1,1,                /* A0 */
        T|3,T|3,T|3,T|3,        1,1,1,1,                /* A0 */

        T|E|2,T|3,1,1,          1,1,1,1,                /* A8 */
        T|E|2,T|E|2,T|E|2,T|E|2, T|E|2,T|E|2,T|E|2,T|E|2,       /* B0 */
        T|3,T|3,T|3,T|3,        T|3,T|3,T|3,T|3,                /* B8 */
        M|T|E|3,M|T|E|3,T|3,1,  M|2,M|2,M|T|E|R|3,M|T|R|4,      /* C0 */
        T|E|4,1,T|3,1,          1,T|E|2,1,1,            /* C8 */
        M|2,M|2,M|2,M|2,        T|E|2,T|E|2,0,1,        /* D0 */
        /* For the floating instructions, allow room for the FWAIT      */
        M|2,M|2,M|2,M|2,        M|2,M|2,M|2,M|2,        /* D8 */
        T|E|2,T|E|2,T|E|2,T|E|2, T|E|2,T|E|2,T|E|2,T|E|2,       /* E0 */
        T|3,T|3,T|5,T|E|2,              1,1,1,1,                /* E8 */
        1,0,1,1,                1,1,M|A|2,M|A|2,                /* F0 */
        1,1,1,1,                1,1,M|2,M|R|2                   /* F8 */
];
120 
// Plain sizes with no flag bits OR'ed in; 0 marks the same slots that are 0
// in inssize[]. NOTE(review): presumably the 32 bit operand/address size
// counterpart of inssize[], indexed by opcode byte - confirm against the users
// of this table.
private __gshared const ubyte[256] inssize32 =
[       2,2,2,2,        2,5,1,1,                /* 00 */
        2,2,2,2,        2,5,1,1,                /* 08 */
        2,2,2,2,        2,5,1,1,                /* 10 */
        2,2,2,2,        2,5,1,1,                /* 18 */
        2,2,2,2,        2,5,1,1,                /* 20 */
        2,2,2,2,        2,5,1,1,                /* 28 */
        2,2,2,2,        2,5,1,1,                /* 30 */
        2,2,2,2,        2,5,1,1,                /* 38 */
        1,1,1,1,        1,1,1,1,                /* 40 */
        1,1,1,1,        1,1,1,1,                /* 48 */
        1,1,1,1,        1,1,1,1,                /* 50 */
        1,1,1,1,        1,1,1,1,                /* 58 */
        1,1,2,2,        1,1,1,1,                /* 60 */
        5,6,2,3,        1,1,1,1,                /* 68 */
        2,2,2,2,        2,2,2,2,                /* 70 */
        2,2,2,2,        2,2,2,2,                /* 78 */
        3,6,3,3,        2,2,2,2,                /* 80 */
        2,2,2,2,        2,2,2,2,                /* 88 */
        1,1,1,1,        1,1,1,1,                /* 90 */
        1,1,7,1,        1,1,1,1,                /* 98 */
        5,5,5,5,        1,1,1,1,                /* A0 */
        2,5,1,1,        1,1,1,1,                /* A8 */
        2,2,2,2,        2,2,2,2,                /* B0 */
        5,5,5,5,        5,5,5,5,                /* B8 */
        3,3,3,1,        2,2,3,6,                /* C0 */
        4,1,3,1,        1,2,1,1,                /* C8 */
        2,2,2,2,        2,2,0,1,                /* D0 */
        /* For the floating instructions, don't need room for the FWAIT */
        2,2,2,2,        2,2,2,2,                /* D8 */

        2,2,2,2,        2,2,2,2,                /* E0 */
        5,5,7,2,        1,1,1,1,                /* E8 */
        1,0,1,1,        1,1,2,2,                /* F0 */
        1,1,1,1,        1,1,2,2                 /* F8 */
];
157 
/* For 2 byte opcodes starting with 0x0F.
 * Entries use the same flag-bits-plus-size encoding as inssize[].
 * hasModregrm() indexes this table with an opcode byte to test the M flag.
 * Entries 0x80..0x8F start out as W|T|4 and are patched to W|T|6 by
 * cod3_set32() and cod3_set64(), which is why the table is mutable.
 */
private __gshared ubyte[256] inssize2 =
[       M|3,M|3,M|3,M|3,        2,2,2,2,                // 00
        2,2,M|3,2,              2,M|3,2,M|T|E|4,        // 08
        M|3,M|3,M|3,M|3,        M|3,M|3,M|3,M|3,        // 10
        M|3,2,2,2,              2,2,2,2,                // 18
        M|3,M|3,M|3,M|3,        M|3,2,M|3,2,            // 20
        M|3,M|3,M|3,M|3,        M|3,M|3,M|3,M|3,        // 28
        2,2,2,2,                2,2,2,2,                // 30
        M|4,2,M|T|E|5,2,        2,2,2,2,                // 38
        M|3,M|3,M|3,M|3,        M|3,M|3,M|3,M|3,        // 40
        M|3,M|3,M|3,M|3,        M|3,M|3,M|3,M|3,        // 48
        M|3,M|3,M|3,M|3,        M|3,M|3,M|3,M|3,        // 50
        M|3,M|3,M|3,M|3,        M|3,M|3,M|3,M|3,        // 58
        M|3,M|3,M|3,M|3,        M|3,M|3,M|3,M|3,        // 60
        M|3,M|3,M|3,M|3,        M|3,M|3,M|3,M|3,        // 68
        M|T|E|4,M|T|E|4,M|T|E|4,M|T|E|4, M|3,M|3,M|3,2, // 70
        2,2,2,2,                M|3,M|3,M|3,M|3,        // 78
        W|T|4,W|T|4,W|T|4,W|T|4, W|T|4,W|T|4,W|T|4,W|T|4, // 80
        W|T|4,W|T|4,W|T|4,W|T|4, W|T|4,W|T|4,W|T|4,W|T|4, // 88
        M|3,M|3,M|3,M|3, M|3,M|3,M|3,M|3,       // 90
        M|3,M|3,M|3,M|3, M|3,M|3,M|3,M|3,       // 98
        2,2,2,M|3,      M|T|E|4,M|3,2,2,        // A0
        2,2,2,M|3,      M|T|E|4,M|3,M|3,M|3,    // A8
        M|E|3,M|3,M|3,M|3, M|3,M|3,M|3,M|3,     // B0
        M|3,2,M|T|E|4,M|3, M|3,M|3,M|3,M|3,     // B8
        M|3,M|3,M|T|E|4,M|3, M|T|E|4,M|T|E|4,M|T|E|4,M|3,       // C0
        2,2,2,2,        2,2,2,2,                // C8
        M|3,M|3,M|3,M|3, M|3,M|3,M|3,M|3,       // D0
        M|3,M|3,M|3,M|3, M|3,M|3,M|3,M|3,       // D8
        M|3,M|3,M|3,M|3, M|3,M|3,M|3,M|3,       // E0
        M|3,M|3,M|3,M|3, M|3,M|3,M|3,M|3,       // E8
        M|3,M|3,M|3,M|3, M|3,M|3,M|3,M|3,       // F0
        M|3,M|3,M|3,M|3, M|3,M|3,M|3,2          // F8
];
193 
/*************************************************
 * Emit code that stores `reg` into the next free slot of the `regsave`
 * stack area, and advance the area's bookkeeping.
 *
 * XMM registers get a 16 byte slot at a 16 byte aligned offset and force
 * the area's alignment to 16; other registers get a REGSIZE slot.
 * Also sets `reflocal` and keeps `regsave.top` as the high water mark.
 * Params:
 *      regsave = register save area on stack
 *      cdb = where to write generated code
 *      reg = register to save
 *      idx = set to the slot offset within regsave, for use in REGSAVE_restore()
 */

@trusted
void REGSAVE_save(ref REGSAVE regsave, ref CodeBuilder cdb, reg_t reg, out uint idx)
{
    if (!isXMMreg(reg))
    {
        // First general purpose save establishes the minimum alignment
        if (regsave.alignment == 0)
            regsave.alignment = REGSIZE;

        idx = regsave.idx;
        regsave.idx += REGSIZE;

        // MOV idx[RBP],reg
        cdb.genc1(0x89, modregxrm(2, reg, BPRM), FLregsave, cast(targ_uns) idx);
        if (I64)
            code_orrex(cdb.last(), REX_W);
    }
    else
    {
        regsave.alignment = 16;

        // Round the slot offset up to a multiple of 16, then claim 16 bytes
        idx = (regsave.idx + 15) & ~15;
        regsave.idx = idx + 16;

        // Store xmm to idx[RBP]; the unaligned form is used on 32 bit Linux
        // because (per the original note) it is not yet figured out why the
        // stack is not aligned to 16 there.
        const opcode_t store = (TARGET_LINUX && I32) ? STOUPD : STOAPD;
        cdb.genc1(store, modregxrm(2, reg - XMM0, BPRM), FLregsave, cast(targ_uns) idx);
    }

    reflocal = true;

    // keep high water mark
    if (regsave.top < regsave.idx)
        regsave.top = regsave.idx;
}
234 
/*******************************
 * Emit code that reloads `reg` from slot `idx` of the `regsave` area.
 * Complement of REGSAVE_save().
 * Params:
 *      regsave = register save area on stack (only its alignment is checked)
 *      cdb = where to write generated code
 *      reg = register to restore
 *      idx = slot offset previously produced by REGSAVE_save()
 */

@trusted
void REGSAVE_restore(const ref REGSAVE regsave, ref CodeBuilder cdb, reg_t reg, uint idx)
{
    if (!isXMMreg(reg))
    {
        // MOV reg,idx[RBP]
        cdb.genc1(0x8B, modregxrm(2, reg, BPRM), FLregsave, cast(targ_uns) idx);
        if (I64)
            code_orrex(cdb.last(), REX_W);
        return;
    }

    // An XMM save must have forced the area to 16 byte alignment
    assert(regsave.alignment == 16);

    // Load xmm from idx[RBP]; the unaligned form is used on 32 bit Linux
    // because (per the original note) it is not yet figured out why the
    // stack is not aligned to 16 there.
    const opcode_t load = (TARGET_LINUX && I32) ? LODUPD : LODAPD;
    cdb.genc1(load, modregxrm(2, reg - XMM0, BPRM), FLregsave, cast(targ_uns) idx);
}
260 
/************************************
 * Size for vex encoded instruction.
 *
 * The result is an inssize2[] entry (flag bits included) plus a small
 * constant: +1 for the non-CFvex3 form, +2 for the CFvex3 form with
 * mmmm 0 or 1, and the 0F 38 / 0F 3A table entries +1 for mmmm 2 / 3.
 * Params:
 *      c = instruction; must have CFvex set and a 0xC4 prefix byte
 * Returns:
 *      the computed table value
 */

@trusted
ubyte vex_inssize(code *c)
{
    assert(c.Iflags & CFvex && c.Ivex.pfx == 0xC4);

    if (!(c.Iflags & CFvex3))
        return cast(ubyte)(inssize2[c.Ivex.op] + 1);

    switch (c.Ivex.mmmm)
    {
        case 0: // no prefix
        case 1: // 0F
            return cast(ubyte)(inssize2[c.Ivex.op] + 2);

        case 2: // 0F 38
            return cast(ubyte)(inssize2[0x38] + 1);

        case 3: // 0F 3A
            return cast(ubyte)(inssize2[0x3A] + 1);

        default:
            printf("Iop = %x mmmm = %x\n", c.Iop, c.Ivex.mmmm);
            assert(0);
    }
}
295 
/************************************
 * Determine if there is a modregrm byte for instruction.
 *
 * Looks the opcode up in inssize[] or inssize2[] and tests the M flag.
 * ESCAPE pseudo-instructions never have one.
 * Params:
 *      c = instruction
 * Returns:
 *      true if has modregrm byte
 */

@trusted
bool hasModregrm(scope const code* c)
{
    const opcode_t op = c.Iop;
    const uint lowByte = op & 0xFF;

    if (lowByte == ESCAPE)
        return false;

    ubyte entry;
    if ((op & 0xFFFD00) == 0x0F3800)        // 0F 38 xx or 0F 3A xx
        entry = inssize2[(op >> 8) & 0xFF];
    else if ((op & 0xFF00) == 0x0F00)       // 0F xx
        entry = inssize2[lowByte];
    else
        entry = inssize[lowByte];

    return (entry & M) != 0;
}
319 
/********************************
 * Set up the ALLREGS and BYTEREGS register masks.
 * For I64 every register in ALLREGS is also a byte register; otherwise
 * the *_INIT defaults are used.
 * Called by: codgen
 */

@trusted
void cod3_initregs()
{
    if (!I64)
    {
        ALLREGS = ALLREGS_INIT;
        BYTEREGS = BYTEREGS_INIT;
        return;
    }

    ALLREGS = mAX|mBX|mCX|mDX|mSI|mDI| mR8|mR9|mR10|mR11|mR12|mR13|mR14|mR15;
    BYTEREGS = ALLREGS;
}
339 
/********************************
 * Set initial global variable values:
 * `fregsaved` starts out as BP, SI and DI.
 */

@trusted
void cod3_setdefault()
{
    fregsaved = mDI | mSI | mBP;
}
349 
/********************************
 * Fix global variables for 386.
 *
 * Patches the instruction size tables, selects the [EBP] addressing mode,
 * sets the mask of registers saved across function calls, picks the
 * 32 bit float/double register sets and sets TARGET_STACKALIGN.
 */
@trusted
void cod3_set32()
{
    // Opcodes A0..A3
    foreach (ref entry; inssize[0xA0 .. 0xA4])
        entry = T|5;

    // 0F 80..8F
    foreach (ref entry; inssize2[0x80 .. 0x90])
        entry = W|T|6;

    BPRM = 5;                       /* [EBP] addressing mode        */

    // saved across function calls
    fregsaved = mBP | mBX | mSI | mDI;
    if (config.flags3 & CFG3eseqds)
        fregsaved |= mES;

    FLOATREGS = FLOATREGS_32;
    FLOATREGS2 = FLOATREGS2_32;
    DOUBLEREGS = DOUBLEREGS_32;

    if (config.fpxmmregs)
        TARGET_STACKALIGN = 16;
    else
        TARGET_STACKALIGN = 4;
}
373 
/********************************
 * Fix global variables for I64.
 *
 * Patches the instruction size tables, selects the [RBP] addressing mode,
 * sets the mask of registers saved across function calls (which differs
 * between EX_windos and other targets), picks the 64 bit float/double
 * register sets, sets ALLREGS/BYTEREGS and sets TARGET_STACKALIGN.
 */

@trusted
void cod3_set64()
{
    // MOV AL,mem / MOV RAX,mem / MOV mem,AL / MOV mem,RAX
    foreach (ref entry; inssize[0xA0 .. 0xA4])
        entry = T|5;

    // 0F 80..8F
    foreach (ref entry; inssize2[0x80 .. 0x90])
        entry = W|T|6;

    BPRM = 5;                           // [RBP] addressing mode

    // saved across function calls
    if (config.exe & EX_windos)
    {
        // The original note here reads "also XMM8..15"; those registers are
        // not part of this mask.
        fregsaved = mBP | mBX | mDI | mSI | mR12 | mR13 | mR14 | mR15 | mES | mXMM6 | mXMM7;
    }
    else
        fregsaved = mBP | mBX | mR12 | mR13 | mR14 | mR15 | mES;

    FLOATREGS = FLOATREGS_64;
    FLOATREGS2 = FLOATREGS2_64;
    DOUBLEREGS = DOUBLEREGS_64;

    ALLREGS = mAX|mBX|mCX|mDX|mSI|mDI|  mR8|mR9|mR10|mR11|mR12|mR13|mR14|mR15;
    BYTEREGS = ALLREGS;

    if (config.fpxmmregs)
        TARGET_STACKALIGN = 16;
    else
        TARGET_STACKALIGN = 8;
}
403 
/*********************************
 * Write `nbytes` of NOP padding to a segment.
 *
 * When I64 or config.fpxmmregs is set, multi-byte NOPs of up to 9 bytes
 * are used; otherwise (and for a single remaining byte) runs of the
 * one byte NOP, at most 15 per write.
 * Params:
 *      seg = segment to write alignment bytes to
 *      nbytes = number of alignment bytes to write
 */
@trusted
void cod3_align_bytes(int seg, size_t nbytes)
{
    /* Table 4-2 from Intel Instruction Set Reference M-Z
     * 1 bytes NOP                                        90
     * 2 bytes 66 NOP                                     66 90
     * 3 bytes NOP DWORD ptr [EAX]                        0F 1F 00
     * 4 bytes NOP DWORD ptr [EAX + 00H]                  0F 1F 40 00
     * 5 bytes NOP DWORD ptr [EAX + EAX*1 + 00H]          0F 1F 44 00 00
     * 6 bytes 66 NOP DWORD ptr [EAX + EAX*1 + 00H]       66 0F 1F 44 00 00
     * 7 bytes NOP DWORD ptr [EAX + 00000000H]            0F 1F 80 00 00 00 00
     * 8 bytes NOP DWORD ptr [EAX + EAX*1 + 00000000H]    0F 1F 84 00 00 00 00 00
     * 9 bytes 66 NOP DWORD ptr [EAX + EAX*1 + 00000000H] 66 0F 1F 84 00 00 00 00 00
     * only for CPUs: CPUID.01H.EAX[Bytes 11:8] = 0110B or 1111B
     */

    assert(SegData[seg].SDseg == seg);

    while (nbytes != 0)
    {
        const(char)[] pad;      // bytes to emit this iteration

        if (nbytes == 1 || !(I64 || config.fpxmmregs))
        {
            static immutable ubyte[15] nops = [
                0x90,0x90,0x90,0x90,0x90,0x90,0x90,0x90,0x90,0x90,0x90,0x90,0x90,0x90,0x90
            ]; // XCHG AX,AX
            const len = nbytes < nops.length ? nbytes : nops.length;
            pad = cast(const(char)[]) nops[0 .. len];
        }
        else
        {
            // Longest multi-byte NOP that fits; anything above 8 gets the 9 byte form
            switch (nbytes)
            {
                case 2:  pad = "\x66\x90"; break;
                case 3:  pad = "\x0F\x1F\x00"; break;
                case 4:  pad = "\x0F\x1F\x40\x00"; break;
                case 5:  pad = "\x0F\x1F\x44\x00\x00"; break;
                case 6:  pad = "\x66\x0F\x1F\x44\x00\x00"; break;
                case 7:  pad = "\x0F\x1F\x80\x00\x00\x00\x00"; break;
                case 8:  pad = "\x0F\x1F\x84\x00\x00\x00\x00\x00"; break;
                default: pad = "\x66\x0F\x1F\x84\x00\x00\x00\x00\x00"; break;
            }
        }

        objmod.write_bytes(SegData[seg], pad);
        nbytes -= pad.length;
    }
}
459 
/****************************
 * Align start of function.
 *
 * On non-EX_windos targets the segment is always padded to a multiple
 * of 8. On EX_windos targets padding happens only when optimizing for
 * speed on a 486 or PentiumPro-and-later target, and only when fewer
 * than 8 bytes are needed to reach a 16 byte boundary.
 * Params:
 *      seg = segment of function
 */
@trusted
void cod3_align(int seg)
{
    if (!(config.exe & EX_windos))
    {
        const nbytes = -Offset(seg) & 7;
        cod3_align_bytes(seg, nbytes);
        return;
    }

    // Only bother when optimized for speed
    if (!(config.flags4 & CFG4speed))
        return;

    // Pick alignment based on CPU target
    if (config.target_cpu != TARGET_80486 &&
        config.target_cpu < TARGET_PentiumPro)
        return;

    // 486 does reads on 16 byte boundaries, so if we are near
    // such a boundary, align us to it
    const nbytes = -Offset(seg) & 15;
    if (nbytes < 8)
        cod3_align_bytes(seg, nbytes);
}
490 
491 
/**********************************
 * Generate code to adjust the stack pointer by `nbytes`.
 *
 * A positive `nbytes` emits SUB ESP,nbytes; zero or negative emits
 * ADD ESP,-nbytes. For I64 the instruction gets REX_W.
 * Params:
 *      cdb = code builder
 *      nbytes = number of bytes to adjust stack pointer
 */
void cod3_stackadj(ref CodeBuilder cdb, int nbytes)
{
    //printf("cod3_stackadj(%d)\n", nbytes);
    const uint rexw = I64 ? REX_W << 16 : 0;
    const bool subtract = nbytes > 0;

    const uint modrm = subtract
        ? modregrm(3,5,SP)      // SUB ESP,nbytes
        : modregrm(3,0,SP);     // ADD ESP,nbytes
    const int amount = subtract ? nbytes : -nbytes;

    cdb.genc2(0x81, rexw | modrm, amount);
}
512 
/**********************************
 * Generate code to align the stack pointer at `nbytes`
 * by emitting AND ESP,-nbytes (with REX_W for I64).
 * Params:
 *      cdb = code builder
 *      nbytes = number of bytes to align stack pointer
 */
void cod3_stackalign(ref CodeBuilder cdb, int nbytes)
{
    //printf("cod3_stackalign(%d)\n", nbytes);
    uint modrm = modregrm(3, 4, SP);            // AND ESP,-nbytes
    if (I64)
        modrm |= REX_W << 16;
    cdb.genc2(0x81, modrm, -nbytes);
}
526 
/* Constructor that links the ModuleReference to the head of
 * the list pointed to by _Dmoduleref
 *
 * For ELF object files.
 *
 * NOTE: the whole function is wrapped in `static if (0)` and is therefore
 * not compiled.
 */
static if (0)
{
void cod3_buildmodulector(OutBuffer* buf, int codeOffset, int refOffset)
{
    /*      ret
     * codeOffset:
     *      pushad
     *      mov     EAX,&ModuleReference
     *      mov     ECX,_DmoduleRef
     *      mov     EDX,[ECX]
     *      mov     [EAX],EDX
     *      mov     [ECX],EAX
     *      popad
     *      ret
     */

    const int seg = CODE;

    // Load the two addresses: RIP-relative for 64 bit PIC, absolute otherwise
    if (I64 && config.flags3 & CFG3pic)
    {   // LEA RAX,ModuleReference[RIP]
        buf.writeByte(REX | REX_W);
        buf.writeByte(LEA);
        buf.writeByte(modregrm(0,AX,5));
        codeOffset += 3;
        codeOffset += Obj.writerel(seg, codeOffset, R_X86_64_PC32, 3 /*STI_DATA*/, refOffset - 4);

        // MOV RCX,_DmoduleRef@GOTPCREL[RIP]
        buf.writeByte(REX | REX_W);
        buf.writeByte(0x8B);
        buf.writeByte(modregrm(0,CX,5));
        codeOffset += 3;
        codeOffset += Obj.writerel(seg, codeOffset, R_X86_64_GOTPCREL, Obj.external_def("_Dmodule_ref"), -4);
    }
    else
    {
        /* movl ModuleReference*, %eax */
        buf.writeByte(0xB8);
        codeOffset += 1;
        const uint reltype = I64 ? R_X86_64_32 : R_386_32;
        codeOffset += Obj.writerel(seg, codeOffset, reltype, 3 /*STI_DATA*/, refOffset);

        /* movl _Dmodule_ref, %ecx */
        buf.writeByte(0xB9);
        codeOffset += 1;
        codeOffset += Obj.writerel(seg, codeOffset, reltype, Obj.external_def("_Dmodule_ref"), 0);
    }

    // Link the ModuleReference in at the head of the list; each MOV gets a
    // REX.W prefix when generating 64 bit code
    if (I64)
        buf.writeByte(REX | REX_W);
    buf.writeByte(0x8B); buf.writeByte(0x11); /* movl (%ecx), %edx */
    if (I64)
        buf.writeByte(REX | REX_W);
    buf.writeByte(0x89); buf.writeByte(0x10); /* movl %edx, (%eax) */
    if (I64)
        buf.writeByte(REX | REX_W);
    buf.writeByte(0x89); buf.writeByte(0x01); /* movl %eax, (%ecx) */

    buf.writeByte(0xC3); /* ret */
}
}
592 
/*****************************
 * Given a type, return a mask of
 * registers to hold that type.
 * Params:
 *      tym = type to find registers for
 *      tyf = function type; only consulted for TYcfloat, where a TYnfunc
 *            on 32 bit posix targets selects DX:AX
 * Returns:
 *      register mask, 0 for void, noreturn, struct and array types
 */

@trusted
regm_t regmask(tym_t tym, tym_t tyf)
{
    switch (tybasic(tym))
    {
        case TYvoid, TYnoreturn, TYstruct, TYarray:
            return 0;

        case TYbool, TYwchar_t, TYchar16,
             TYchar, TYschar, TYuchar,
             TYshort, TYushort,
             TYint, TYuint,
             TYnullptr, TYnptr, TYnref, TYsptr, TYcptr,
             TYimmutPtr, TYsharePtr, TYrestrictPtr, TYfgPtr:
            return mAX;

        case TYfloat, TYifloat:
            if (I64)
                return mXMM0;
            if (config.exe & EX_flat)
                return mST0;
            // otherwise treated like a long
            if (!I16)
                return mAX;
            return mDX | mAX;

        case TYlong, TYulong, TYdchar:
            if (!I16)
                return mAX;
            return mDX | mAX;       // register pair, same as far pointers

        case TYfptr, TYhptr:
            return mDX | mAX;

        case TYcent, TYucent:
            assert(I64);
            return mDX | mAX;

        case TYvptr:
            return mDX | mBX;

        case TYdouble, TYdouble_alias, TYidouble:
            if (I64)
                return mXMM0;
            if (config.exe & EX_flat)
                return mST0;
            return DOUBLEREGS;

        case TYllong, TYullong:
            if (I64)
                return mAX;
            if (I32)
                return mDX | mAX;
            return DOUBLEREGS;

        case TYldouble, TYildouble:
            return mST0;

        case TYcfloat:
            if (config.exe & EX_posix && I32 && tybasic(tyf) == TYnfunc)
                return mDX | mAX;
            // otherwise treated like a complex double
            if (I64)
                return mXMM0 | mXMM1;
            return mST01;

        case TYcdouble:
            if (I64)
                return mXMM0 | mXMM1;
            return mST01;

        case TYcldouble:
            return mST01;

        // SIMD vector types
        case TYfloat4, TYdouble2,
             TYschar16, TYuchar16,
             TYshort8, TYushort8,
             TYlong4, TYulong4,
             TYllong2, TYullong2,

             TYfloat8, TYdouble4,
             TYschar32, TYuchar32,
             TYshort16, TYushort16,
             TYlong8, TYulong8,
             TYllong4, TYullong4:
            if (!config.fpxmmregs)
            {
                printf("SIMD operations not supported on this platform\n");
                exit(1);
            }
            return mXMM0;

        default:
            debug printf("%s\n", tym_str(tym));
            assert(0);
    }
}
722 
/*******************************
 * setup register allocator parameters with platform specific data
 * Params:
 *      dst_integer_reg = set to AX
 *      dst_float_reg = set to XMM0
 */
void cgreg_dst_regs(reg_t* dst_integer_reg, reg_t* dst_float_reg)
{
    *dst_integer_reg = AX;
    *dst_float_reg   = XMM0;
}
731 
/*******************************
 * Select the order in which registers are tried for a value of type `ty`.
 *
 * Each sequence is a statically allocated, NOREG terminated list.
 * Params:
 *      ty = type of the value to be placed in a register
 *      pseq = set to the register sequence (for register pairs, the
 *             sequence for the least significant word)
 *      pseqmsw = set to the sequence for the most significant word, but
 *             only when `ty` is not an XMM type, its size is REGSIZE * 2
 *             and the target is I64 or I32; left untouched otherwise
 */
@trusted
void cgreg_set_priorities(tym_t ty, const(reg_t)** pseq, const(reg_t)** pseqmsw)
{
    //printf("cgreg_set_priorities %x\n", ty);
    const sz = tysize(ty);

    if (tyxmmreg(ty))
    {
        static immutable ubyte[9] sequence = [XMM0,XMM1,XMM2,XMM3,XMM4,XMM5,XMM6,XMM7,NOREG];
        *pseq = sequence.ptr;
    }
    else if (I64)
    {
        if (sz == REGSIZE * 2)
        {
            static immutable ubyte[3] seqmsw1 = [CX,DX,NOREG];
            static immutable ubyte[5] seqlsw1 = [AX,BX,SI,DI,NOREG];
            *pseq = seqlsw1.ptr;
            *pseqmsw = seqmsw1.ptr;
        }
        else
        {   // R10 is reserved for the static link
            static immutable ubyte[15] sequence2 = [AX,CX,DX,SI,DI,R8,R9,R11,BX,R12,R13,R14,R15,BP,NOREG];
            // No cast needed: immutable(ubyte)* converts implicitly to
            // const(reg_t)*, exactly as in every other branch.
            *pseq = sequence2.ptr;
        }
    }
    else if (I32)
    {
        if (sz == REGSIZE * 2)
        {
            static immutable ubyte[5] seqlsw3 = [AX,BX,SI,DI,NOREG];
            static immutable ubyte[3] seqmsw3 = [CX,DX,NOREG];
            *pseq = seqlsw3.ptr;
            *pseqmsw = seqmsw3.ptr;
        }
        else
        {
            static immutable ubyte[8] sequence4 = [AX,CX,DX,BX,SI,DI,BP,NOREG];
            *pseq = sequence4.ptr;
        }
    }
    else
    {   assert(I16);
        if (typtr(ty))
        {
            // For pointer types, try to pick index register first
            static immutable ubyte[8] seqidx5 = [BX,SI,DI,AX,CX,DX,BP,NOREG];
            *pseq = seqidx5.ptr;
        }
        else
        {
            // Otherwise, try to pick index registers last
            static immutable ubyte[8] sequence6 = [AX,CX,DX,BX,SI,DI,BP,NOREG];
            *pseq = sequence6.ptr;
        }
    }
}
789 
/*******************************************
 * Call finally block.
 *
 * Emits: save of `retregs`, optional stack adjustment so the call is made
 * at STACKALIGN alignment, CALL to `bf`, undo of the adjustment, restore of
 * `retregs`. Also sets `calledFinally` and clears regcon.immed.mval.
 * Params:
 *      bf = block to call
 *      retregs = registers to preserve across call
 * Returns:
 *      code generated
 */
@trusted
private code *callFinallyBlock(block *bf, regm_t retregs)
{
    CodeBuilder cdbSave;    cdbSave.ctor();
    CodeBuilder cdbRestore; cdbRestore.ctor();

    calledFinally = true;
    uint npush = gensaverestore(retregs, cdbSave, cdbRestore);

    int nalign = 0;         // extra bytes subtracted from the stack pointer
    if (STACKALIGN >= 16)
    {
        // presumably accounts for the return address pushed by the CALL - confirm
        npush += REGSIZE;

        const misalign = npush & (STACKALIGN - 1);
        if (misalign)
        {
            nalign = STACKALIGN - misalign;
            cod3_stackadj(cdbSave, nalign);
        }
    }

    // CALL bf
    cdbSave.genc(0xE8,0,0,0,FLblock,cast(targ_size_t)bf);
    regcon.immed.mval = 0;

    if (nalign)
        cod3_stackadj(cdbSave, -nalign);

    cdbSave.append(cdbRestore);
    return cdbSave.finish();
}
822 
823 /*******************************
824  * Generate block exit code
825  */
826 @trusted
827 void outblkexitcode(ref CodeBuilder cdb, block *bl, ref int anyspill, const(char)* sflsave, Symbol** retsym, const regm_t mfuncregsave)
828 {
829     CodeBuilder cdb2; cdb2.ctor();
830     elem *e = bl.Belem;
831     block *nextb;
832     regm_t retregs = 0;
833 
834     if (bl.BC != BCasm)
835         assert(bl.Bcode == null);
836 
837     switch (bl.BC)                     /* block exit condition         */
838     {
839         case BCiftrue:
840         {
841             bool jcond = true;
842             block *bs1 = bl.nthSucc(0);
843             block *bs2 = bl.nthSucc(1);
844             if (bs1 == bl.Bnext)
845             {   // Swap bs1 and bs2
846                 block *btmp;
847 
848                 jcond ^= 1;
849                 btmp = bs1;
850                 bs1 = bs2;
851                 bs2 = btmp;
852             }
853             logexp(cdb,e,jcond,FLblock,cast(code *) bs1);
854             nextb = bs2;
855         }
856         L5:
857             if (configv.addlinenumbers && bl.Bsrcpos.Slinnum &&
858                 !(funcsym_p.ty() & mTYnaked))
859             {
860                 //printf("BCiftrue: %s(%u)\n", bl.Bsrcpos.Sfilename ? bl.Bsrcpos.Sfilename : "", bl.Bsrcpos.Slinnum);
861                 cdb.genlinnum(bl.Bsrcpos);
862             }
863             if (nextb != bl.Bnext)
864             {
865                 assert(!(bl.Bflags & BFLepilog));
866                 genjmp(cdb,JMP,FLblock,nextb);
867             }
868             break;
869 
870         case BCjmptab:
871         case BCifthen:
872         case BCswitch:
873         {
874             assert(!(bl.Bflags & BFLepilog));
875             doswitch(cdb,bl);               // hide messy details
876             break;
877         }
878         case BCjcatch:          // D catch clause of try-catch
879             assert(ehmethod(funcsym_p) != EHmethod.EH_NONE);
880             // Mark all registers as destroyed. This will prevent
881             // register assignments to variables used in catch blocks.
882             getregs(cdb,lpadregs());
883 
884             if (config.ehmethod == EHmethod.EH_DWARF)
885             {
886                 /* Each block must have ESP set to the same value it was at the end
887                  * of the prolog. But the unwinder calls catch blocks with ESP set
888                  * at the value it was when the throwing function was called, which
889                  * may have arguments pushed on the stack.
890                  * This instruction will reset ESP to the correct offset from EBP.
891                  */
892                 cdb.gen1(ESCAPE | ESCfixesp);
893             }
894             goto case_goto;
895         case BCgoto:
896             nextb = bl.nthSucc(0);
897             if ((MARS ||
898                  funcsym_p.Sfunc.Fflags3 & Fnteh) &&
899                 ehmethod(funcsym_p) != EHmethod.EH_DWARF &&
900                 bl.Btry != nextb.Btry &&
901                 nextb.BC != BC_finally)
902             {
903                 regm_t retregsx = 0;
904                 gencodelem(cdb,e,&retregsx,true);
905                 int toindex = nextb.Btry ? nextb.Btry.Bscope_index : -1;
906                 assert(bl.Btry);
907                 int fromindex = bl.Btry.Bscope_index;
908                 if (toindex + 1 == fromindex)
909                 {   // Simply call __finally
910                     if (bl.Btry &&
911                         bl.Btry.nthSucc(1).BC == BCjcatch)
912                     {
913                         goto L5;        // it's a try-catch, not a try-finally
914                     }
915                 }
916                 if (config.ehmethod == EHmethod.EH_WIN32 && !(funcsym_p.Sfunc.Fflags3 & Feh_none) ||
917                     config.ehmethod == EHmethod.EH_SEH)
918                 {
919                     nteh_unwind(cdb,0,toindex);
920                 }
921                 else
922                 {
923                 if (toindex + 1 <= fromindex)
924                 {
925                     //c = cat(c, linux_unwind(0, toindex));
926                     block *bt;
927 
928                     //printf("B%d: fromindex = %d, toindex = %d\n", bl.Bdfoidx, fromindex, toindex);
929                     bt = bl;
930                     while ((bt = bt.Btry) != null && bt.Bscope_index != toindex)
931                     {   block *bf;
932 
933                         //printf("\tbt.Bscope_index = %d, bt.Blast_index = %d\n", bt.Bscope_index, bt.Blast_index);
934                         bf = bt.nthSucc(1);
935                         // Only look at try-finally blocks
936                         if (bf.BC == BCjcatch)
937                             continue;
938 
939                         if (bf == nextb)
940                             continue;
941                         //printf("\tbf = B%d, nextb = B%d\n", bf.Bdfoidx, nextb.Bdfoidx);
942                         if (nextb.BC == BCgoto &&
943                             !nextb.Belem &&
944                             bf == nextb.nthSucc(0))
945                             continue;
946 
947                         // call __finally
948                         cdb.append(callFinallyBlock(bf.nthSucc(0), retregsx));
949                     }
950                 }
951                 }
952                 goto L5;
953             }
954         case_goto:
955         {
956             regm_t retregsx = 0;
957             gencodelem(cdb,e,&retregsx,true);
958             if (anyspill)
959             {   // Add in the epilog code
960                 CodeBuilder cdbstore; cdbstore.ctor();
961                 CodeBuilder cdbload;  cdbload.ctor();
962 
963                 for (int i = 0; i < anyspill; i++)
964                 {   Symbol *s = globsym[i];
965 
966                     if (s.Sflags & SFLspill &&
967                         vec_testbit(dfoidx,s.Srange))
968                     {
969                         s.Sfl = sflsave[i];    // undo block register assignments
970                         cgreg_spillreg_epilog(bl,s,cdbstore,cdbload);
971                     }
972                 }
973                 cdb.append(cdbstore);
974                 cdb.append(cdbload);
975             }
976             nextb = bl.nthSucc(0);
977             goto L5;
978         }
979 
980         case BC_try:
981             if (config.ehmethod == EHmethod.EH_NONE || funcsym_p.Sfunc.Fflags3 & Feh_none)
982             {
983                 /* Need to use frame pointer to access locals, not the stack pointer,
984                  * because we'll be calling the BC_finally blocks and the stack will be off.
985                  */
986                 needframe = 1;
987             }
988             else if (config.ehmethod == EHmethod.EH_SEH || config.ehmethod == EHmethod.EH_WIN32)
989             {
990                 usednteh |= NTEH_try;
991                 nteh_usevars();
992             }
993             else
994                 usednteh |= EHtry;
995             goto case_goto;
996 
997         case BC_finally:
998             if (ehmethod(funcsym_p) == EHmethod.EH_DWARF)
999             {
1000                 // Mark scratch registers as destroyed.
1001                 getregsNoSave(lpadregs());
1002 
1003                 regm_t retregsx = 0;
1004                 gencodelem(cdb,bl.Belem,&retregsx,true);
1005 
1006                 // JMP bl.nthSucc(1)
1007                 nextb = bl.nthSucc(1);
1008 
1009                 goto L5;
1010             }
1011             else
1012             {
1013                 if (config.ehmethod == EHmethod.EH_SEH ||
1014                     config.ehmethod == EHmethod.EH_WIN32 && !(funcsym_p.Sfunc.Fflags3 & Feh_none))
1015                 {
1016                     // Mark all registers as destroyed. This will prevent
1017                     // register assignments to variables used in finally blocks.
1018                     getregsNoSave(lpadregs());
1019                 }
1020 
1021                 assert(!e);
1022                 // Generate CALL to finalizer code
1023                 cdb.append(callFinallyBlock(bl.nthSucc(0), 0));
1024 
1025                 // JMP bl.nthSucc(1)
1026                 nextb = bl.nthSucc(1);
1027 
1028                 goto L5;
1029             }
1030 
1031         case BC_lpad:
1032         {
1033             assert(ehmethod(funcsym_p) == EHmethod.EH_DWARF);
1034             // Mark all registers as destroyed. This will prevent
1035             // register assignments to variables used in finally blocks.
1036             getregsNoSave(lpadregs());
1037 
1038             regm_t retregsx = 0;
1039             gencodelem(cdb,bl.Belem,&retregsx,true);
1040 
1041             // JMP bl.nthSucc(0)
1042             nextb = bl.nthSucc(0);
1043             goto L5;
1044         }
1045 
1046         case BC_ret:
1047         {
1048             regm_t retregsx = 0;
1049             gencodelem(cdb,e,&retregsx,true);
1050             if (ehmethod(funcsym_p) == EHmethod.EH_DWARF)
1051             {
1052             }
1053             else
1054                 cdb.gen1(0xC3);   // RET
1055             break;
1056         }
1057 
1058 static if (NTEXCEPTIONS)
1059 {
1060         case BC_except:
1061         {
1062             assert(!e);
1063             usednteh |= NTEH_except;
1064             nteh_setsp(cdb,0x8B);
1065             getregsNoSave(allregs);
1066             nextb = bl.nthSucc(0);
1067             goto L5;
1068         }
1069         case BC_filter:
1070         {
1071             nteh_filter(cdb, bl);
1072             // Mark all registers as destroyed. This will prevent
1073             // register assignments to variables used in filter blocks.
1074             getregsNoSave(allregs);
1075             regm_t retregsx = regmask(e.Ety, TYnfunc);
1076             gencodelem(cdb,e,&retregsx,true);
1077             cdb.gen1(0xC3);   // RET
1078             break;
1079         }
1080 }
1081 
1082         case BCretexp:
1083             reg_t reg1, reg2, lreg, mreg;
1084             retregs = allocretregs(e.Ety, e.ET, funcsym_p.ty(), reg1, reg2);
1085             //printf("allocretregs returns %s\n", regm_str(mask(reg1) | mask(reg2)));
1086 
1087             lreg = mreg = NOREG;
1088             if (reg1 == NOREG)
1089             {}
1090             else if (tybasic(e.Ety) == TYcfloat)
1091                 lreg = ST01;
1092             else if (mask(reg1) & (mST0 | mST01))
1093                 lreg = reg1;
1094             else if (reg2 == NOREG)
1095                 lreg = reg1;
1096             else if (mask(reg1) & XMMREGS)
1097             {
1098                 lreg = XMM0;
1099                 mreg = XMM1;
1100             }
1101             else
1102             {
1103                 lreg = mask(reg1) & mLSW ? reg1 : AX;
1104                 mreg = mask(reg2) & mMSW ? reg2 : DX;
1105             }
1106             if (reg1 != NOREG)
1107                 retregs = (mask(lreg) | mask(mreg)) & ~mask(NOREG);
1108 
1109             // For the final load into the return regs, don't set regcon.used,
1110             // so that the optimizer can potentially use retregs for register
1111             // variable assignments.
1112 
1113             if (config.flags4 & CFG4optimized)
1114             {   regm_t usedsave;
1115 
1116                 docommas(cdb,e);
1117                 usedsave = regcon.used;
1118                 if (!OTleaf(e.Eoper))
1119                     gencodelem(cdb,e,&retregs,true);
1120                 else
1121                 {
1122                     if (e.Eoper == OPconst)
1123                         regcon.mvar = 0;
1124                     gencodelem(cdb,e,&retregs,true);
1125                     regcon.used = usedsave;
1126                     if (e.Eoper == OPvar)
1127                     {   Symbol *s = e.EV.Vsym;
1128 
1129                         if (s.Sfl == FLreg && s.Sregm != mAX)
1130                             *retsym = s;
1131                     }
1132                 }
1133             }
1134             else
1135             {
1136                 gencodelem(cdb,e,&retregs,true);
1137             }
1138 
1139             if (reg1 == NOREG)
1140             {
1141             }
1142             else if ((mask(reg1) | mask(reg2)) & (mST0 | mST01))
1143             {
1144                 assert(reg1 == lreg && reg2 == NOREG);
1145                 regm_t pretregs = mask(reg1) | mask(reg2);
1146                 fixresult87(cdb, e, retregs, &pretregs, true);
1147             }
1148             // fix return registers
1149             else if (tybasic(e.Ety) == TYcfloat)
1150             {
1151                 assert(lreg == ST01);
1152                 if (I64)
1153                 {
1154                     assert(reg2 == NOREG);
1155                     // spill
1156                     pop87();
1157                     pop87();
1158                     cdb.genfltreg(0xD9, 3, tysize(TYfloat));
1159                     genfwait(cdb);
1160                     cdb.genfltreg(0xD9, 3, 0);
1161                     genfwait(cdb);
1162                     // reload
1163                     if (config.exe == EX_WIN64)
1164                     {
1165                         assert(reg1 == AX);
1166                         cdb.genfltreg(LOD, reg1, 0);
1167                         code_orrex(cdb.last(), REX_W);
1168                     }
1169                     else
1170                     {
1171                         assert(reg1 == XMM0);
1172                         cdb.genxmmreg(xmmload(TYdouble), reg1, 0, TYdouble);
1173                     }
1174                 }
1175                 else
1176                 {
1177                     assert(reg1 == AX && reg2 == DX);
1178                     regm_t pretregs = mask(reg1) | mask(reg2);
1179                     fixresult_complex87(cdb, e, retregs, &pretregs, true);
1180                 }
1181             }
1182             else if (reg2 == NOREG)
1183                 assert(lreg == reg1);
1184             else for (int v = 0; v < 2; v++)
1185             {
1186                 if (v ^ (reg1 != mreg))
1187                     genmovreg(cdb, reg1, lreg);
1188                 else
1189                     genmovreg(cdb, reg2, mreg);
1190             }
1191             if (reg1 != NOREG)
1192                 retregs = (mask(reg1) | mask(reg2)) & ~mask(NOREG);
1193             goto L4;
1194 
1195         case BCret:
1196             retregs = 0;
1197             gencodelem(cdb,e,&retregs,true);
1198         L4:
1199             if (retregs == mST0)
1200             {   assert(global87.stackused == 1);
1201                 pop87();                // account for return value
1202             }
1203             else if (retregs == mST01)
1204             {   assert(global87.stackused == 2);
1205                 pop87();
1206                 pop87();                // account for return value
1207             }
1208 
1209             if (MARS || usednteh & NTEH_try)
1210             {
1211                 block *bt = bl;
1212                 while ((bt = bt.Btry) != null)
1213                 {
1214                     block *bf = bt.nthSucc(1);
1215                     // Only look at try-finally blocks
1216                     if (bf.BC == BCjcatch)
1217                     {
1218                         continue;
1219                     }
1220                     if (config.ehmethod == EHmethod.EH_WIN32 && !(funcsym_p.Sfunc.Fflags3 & Feh_none) ||
1221                         config.ehmethod == EHmethod.EH_SEH)
1222                     {
1223                         if (bt.Bscope_index == 0)
1224                         {
1225                             // call __finally
1226                             CodeBuilder cdbs; cdbs.ctor();
1227                             CodeBuilder cdbr; cdbr.ctor();
1228 
1229                             nteh_gensindex(cdb,-1);
1230                             gensaverestore(retregs,cdbs,cdbr);
1231                             cdb.append(cdbs);
1232                             cdb.genc(0xE8,0,0,0,FLblock,cast(targ_size_t)bf.nthSucc(0));
1233                             regcon.immed.mval = 0;
1234                             cdb.append(cdbr);
1235                         }
1236                         else
1237                         {
1238                             nteh_unwind(cdb,retregs,~0);
1239                         }
1240                         break;
1241                     }
1242                     else
1243                     {
1244                         // call __finally
1245                         cdb.append(callFinallyBlock(bf.nthSucc(0), retregs));
1246                     }
1247                 }
1248             }
1249             break;
1250 
1251         case BCexit:
1252             retregs = 0;
1253             gencodelem(cdb,e,&retregs,true);
1254             if (config.flags4 & CFG4optimized)
1255                 mfuncreg = mfuncregsave;
1256             break;
1257 
1258         case BCasm:
1259         {
1260             assert(!e);
1261             // Mark destroyed registers
1262             CodeBuilder cdbx; cdbx.ctor();
1263             getregs(cdbx,iasm_regs(bl));         // mark destroyed registers
1264             code *c = cdbx.finish();
1265             if (bl.Bsucc)
1266             {   nextb = bl.nthSucc(0);
1267                 if (!bl.Bnext)
1268                 {
1269                     cdb.append(bl.Bcode);
1270                     cdb.append(c);
1271                     goto L5;
1272                 }
1273                 if (nextb != bl.Bnext &&
1274                     bl.Bnext &&
1275                     !(bl.Bnext.BC == BCgoto &&
1276                      !bl.Bnext.Belem &&
1277                      nextb == bl.Bnext.nthSucc(0)))
1278                 {
1279                     // See if already have JMP at end of block
1280                     code *cl = code_last(bl.Bcode);
1281                     if (!cl || cl.Iop != JMP)
1282                     {
1283                         cdb.append(bl.Bcode);
1284                         cdb.append(c);
1285                         goto L5;        // add JMP at end of block
1286                     }
1287                 }
1288             }
1289             cdb.append(bl.Bcode);
1290             break;
1291         }
1292 
1293         default:
1294             debug
1295             printf("bl.BC = %d\n",bl.BC);
1296             assert(0);
1297     }
1298 }
1299 
1300 /***************************
1301  * Allocate registers for function return values.
1302  *
1303  * Params:
1304  *    ty    = return type
1305  *    t     = return type extended info
1306  *    tyf   = function type
1307  *    reg1  = set to the first part register, else NOREG
1308  *    reg2  = set to the second part register, else NOREG
1309  *
1310  * Returns:
1311  *    a bit mask of return registers.
1312  *    0 if function returns on the stack or returns void.
1313  */
1314 @trusted
1315 regm_t allocretregs(const tym_t ty, type* t, const tym_t tyf, out reg_t reg1, out reg_t reg2)
1316 {
1317     //printf("allocretregs() ty: %s\n", tym_str(ty));
1318     reg1 = reg2 = NOREG;
1319 
1320     if (!(config.exe & EX_posix))
1321         return regmask(ty, tyf);    // for non-Posix ABI
1322 
1323     /* The rest is for the Itanium ABI
1324      */
1325 
1326     const tyb = tybasic(ty);
1327     if (tyb == TYvoid || tyb == TYnoreturn)
1328         return 0;
1329 
1330     tym_t ty1 = tyb;
1331     tym_t ty2 = TYMAX;  // stays TYMAX if only one register is needed
1332 
1333     if (ty & mTYxmmgpr)
1334     {
1335         ty1 = TYdouble;
1336         ty2 = TYllong;
1337     }
1338     else if (ty & mTYgprxmm)
1339     {
1340         ty1 = TYllong;
1341         ty2 = TYdouble;
1342     }
1343 
1344     if (tyb == TYstruct)
1345     {
1346         assert(t);
1347         ty1 = t.Tty;
1348     }
1349 
1350     const tyfb = tybasic(tyf);
1351     switch (tyrelax(ty1))
1352     {
1353         case TYcent:
1354             if (I32)
1355                 return 0;
1356             ty1 = ty2 = TYllong;
1357             break;
1358 
1359         case TYcdouble:
1360             if (tyfb == TYjfunc && I32)
1361                 break;
1362             if (I32)
1363                 return 0;
1364             ty1 = ty2 = TYdouble;
1365             break;
1366 
1367         case TYcfloat:
1368             if (tyfb == TYjfunc && I32)
1369                 break;
1370             if (I32)
1371                 goto case TYllong;
1372             ty1 = TYdouble;
1373             break;
1374 
1375         case TYcldouble:
1376             if (tyfb == TYjfunc && I32)
1377                 break;
1378             if (I32)
1379                 return 0;
1380             break;
1381 
1382         case TYllong:
1383             if (I32)
1384                 ty1 = ty2 = TYlong;
1385             break;
1386 
1387         case TYarray:
1388             type* targ1, targ2;
1389             argtypes(t, targ1, targ2);
1390             if (targ1)
1391                 ty1 = targ1.Tty;
1392             else
1393                 return 0;
1394             if (targ2)
1395                 ty2 = targ2.Tty;
1396             break;
1397 
1398         case TYstruct:
1399             assert(t);
1400             if (I64)
1401             {
1402                 assert(tybasic(t.Tty) == TYstruct);
1403                 if (const targ1 = t.Ttag.Sstruct.Sarg1type)
1404                     ty1 = targ1.Tty;
1405                 else
1406                     return 0;
1407                 if (const targ2 = t.Ttag.Sstruct.Sarg2type)
1408                     ty2 = targ2.Tty;
1409                 break;
1410             }
1411             return 0;
1412 
1413         default:
1414             break;
1415     }
1416 
1417     /* now we have ty1 and ty2, use that to determine which register
1418      * is used for ty1 and which for ty2
1419      */
1420 
1421     static struct RetRegsAllocator
1422     {
1423     nothrow:
1424         static immutable reg_t[2] gpr_regs = [AX, DX];
1425         static immutable reg_t[2] xmm_regs = [XMM0, XMM1];
1426 
1427         uint cntgpr = 0,
1428              cntxmm = 0;
1429 
1430         reg_t gpr() { return gpr_regs[cntgpr++]; }
1431         reg_t xmm() { return xmm_regs[cntxmm++]; }
1432     }
1433 
1434     RetRegsAllocator rralloc;
1435 
1436     reg_t allocreg(tym_t tym)
1437     {
1438         if (tym == TYMAX)
1439             return NOREG;
1440         switch (tysize(tym))
1441         {
1442         case 1:
1443         case 2:
1444         case 4:
1445             if (tyfloating(tym))
1446                 return I64 ? rralloc.xmm() : ST0;
1447             else
1448                 return rralloc.gpr();
1449 
1450         case 8:
1451             if (tycomplex(tym))
1452             {
1453                 assert(tyfb == TYjfunc && I32);
1454                 return ST01;
1455             }
1456             else if (tysimd(tym))
1457             {
1458                 return rralloc.xmm();
1459             }
1460             assert(I64 || tyfloating(tym));
1461             goto case 4;
1462 
1463         default:
1464             if (tybasic(tym) == TYldouble || tybasic(tym) == TYildouble)
1465             {
1466                 return ST0;
1467             }
1468             else if (tybasic(tym) == TYcldouble)
1469             {
1470                 return ST01;
1471             }
1472             else if (tycomplex(tym) && tyfb == TYjfunc && I32)
1473             {
1474                 return ST01;
1475             }
1476             else if (tysimd(tym))
1477             {
1478                 return rralloc.xmm();
1479             }
1480 
1481             debug printf("%s\n", tym_str(tym));
1482             assert(0);
1483         }
1484     }
1485 
1486     reg1 = allocreg(ty1);
1487     reg2 = allocreg(ty2);
1488 
1489     return (mask(reg1) | mask(reg2)) & ~mask(NOREG);
1490 }
1491 
1492 /***********************************************
1493  * Struct necessary for sorting switch cases.
1494  */
1495 
1496 private alias _compare_fp_t = extern(C) nothrow int function(const void*, const void*);
1497 extern(C) void qsort(void* base, size_t nmemb, size_t size, _compare_fp_t compar);
1498 
1499 extern (C)  // qsort cmp functions need to be "C"
1500 {
1501 struct CaseVal
1502 {
1503     targ_ullong val;
1504     block *target;
1505 
1506     /* Sort function for qsort() */
1507     @trusted
1508     extern (C) static nothrow pure @nogc int cmp(scope const(void*) p, scope const(void*) q)
1509     {
1510         const(CaseVal)* c1 = cast(const(CaseVal)*)p;
1511         const(CaseVal)* c2 = cast(const(CaseVal)*)q;
1512         return (c1.val < c2.val) ? -1 : ((c1.val == c2.val) ? 0 : 1);
1513     }
1514 }
1515 }
1516 
1517 /***
1518  * Generate comparison of [reg2,reg] with val
1519  */
1520 @trusted
1521 private void cmpval(ref CodeBuilder cdb, targ_llong val, uint sz, reg_t reg, reg_t reg2, reg_t sreg)
1522 {
1523     if (I64 && sz == 8)
1524     {
1525         assert(reg2 == NOREG);
1526         if (val == cast(int)val)    // if val is a 64 bit value sign-extended from 32 bits
1527         {
1528             cdb.genc2(0x81,modregrmx(3,7,reg),cast(targ_size_t)val);     // CMP reg,value32
1529             cdb.last().Irex |= REX_W;                  // 64 bit operand
1530         }
1531         else
1532         {
1533             assert(sreg != NOREG);
1534             movregconst(cdb,sreg,cast(targ_size_t)val,64);  // MOV sreg,val64
1535             genregs(cdb,0x3B,reg,sreg);    // CMP reg,sreg
1536             code_orrex(cdb.last(), REX_W);
1537             getregsNoSave(mask(sreg));                  // don't remember we loaded this constant
1538         }
1539     }
1540     else if (reg2 == NOREG)
1541         cdb.genc2(0x81,modregrmx(3,7,reg),cast(targ_size_t)val);         // CMP reg,casevalue
1542     else
1543     {
1544         cdb.genc2(0x81,modregrm(3,7,reg2),cast(targ_size_t)MSREG(val));  // CMP reg2,MSREG(casevalue)
1545         code *cnext = gennop(null);
1546         genjmp(cdb,JNE,FLcode,cast(block *) cnext);  // JNE cnext
1547         cdb.genc2(0x81,modregrm(3,7,reg),cast(targ_size_t)val);          // CMP reg,casevalue
1548         cdb.append(cnext);
1549     }
1550 }
1551 
/***
 * Generate the compare-and-branch code that dispatches the switch value
 * held in [reg2,reg] to the target block of the matching case.
 *
 * With 4 or more cases and CFG4speed set, the cases are split around a pivot
 * and the two halves are generated recursively, giving a binary search at
 * run time. Otherwise a linear sequence of CMP/JE pairs is generated.
 *
 * Params:
 *      cdb      = where the generated code is appended
 *      casevals = the cases; the binary search relies on them being sorted
 *                 by CaseVal.cmp (unsigned order of `val`)
 *      sz       = size in bytes of the switch value
 *      reg      = register holding the value (its least significant part for a pair)
 *      reg2     = register holding the most significant part, NOREG if there is none
 *      sreg     = scratch register passed on to cmpval()
 *      bdefault = block to go to when no case matches
 *      last     = if true, finish with JMP bdefault; if false, a failed
 *                 match simply falls off the end of the generated code
 */
@trusted extern (D)
private void ifthen(ref CodeBuilder cdb, scope CaseVal[] casevals,
        uint sz, reg_t reg, reg_t reg2, reg_t sreg, block *bdefault, bool last)
{
    const ncases = casevals.length;
    if (ncases >= 4 && config.flags4 & CFG4speed)
    {
        size_t pivot = ncases >> 1;

        // Compares for casevals[0..pivot]
        // This code is followed by c2, so it must always end with a jump
        CodeBuilder cdb1; cdb1.ctor();
        ifthen(cdb1, casevals[0 .. pivot], sz, reg, reg2, sreg, bdefault, true);

        // Compares for casevals[pivot+1..ncases]
        CodeBuilder cdb2; cdb2.ctor();
        ifthen(cdb2, casevals[pivot + 1 .. $], sz, reg, reg2, sreg, bdefault, last);
        code *c2 = gennop(null);

        // Compare for caseval[pivot]
        cmpval(cdb, casevals[pivot].val, sz, reg, reg2, sreg);
        genjmp(cdb,JE,FLblock,casevals[pivot].target); // JE target
        // Note uint jump here, as cases were sorted using uint comparisons
        genjmp(cdb,JA,FLcode,cast(block *) c2);           // JA c2

        cdb.append(cdb1);
        cdb.append(c2);
        cdb.append(cdb2);
    }
    else
    {   // Not worth doing a binary search, just do a sequence of CMP/JE
        foreach (size_t n; 0 .. ncases)
        {
            /* cmpval() does the whole comparison, including the most
             * significant register when reg2 != NOREG: on its exit ZF is set
             * only if every part matched. So a JE is all that is needed here,
             * same as for the pivot compare above; comparing reg2 a second
             * time would only emit redundant instructions.
             */
            cmpval(cdb, casevals[n].val, sz, reg, reg2, sreg);
            genjmp(cdb,JE,FLblock,casevals[n].target);   // JE caseaddr
        }

        if (last)       // if default is not next block
            genjmp(cdb,JMP,FLblock,bdefault);
    }
}
1601 
1602 /*******************************
1603  * Generate code for blocks ending in a switch statement.
1604  * Take BCswitch and decide on
1605  *      BCifthen        use if - then code
1606  *      BCjmptab        index into jump table
1607  *      BCswitch        search table for match
1608  */
1609 
1610 @trusted
1611 void doswitch(ref CodeBuilder cdb, block *b)
1612 {
1613     // If switch tables are in code segment and we need a CS: override to get at them
1614     bool csseg = cast(bool)(config.flags & CFGromable);
1615 
1616     //printf("doswitch(%d)\n", b.BC);
1617     elem *e = b.Belem;
1618     elem_debug(e);
1619     docommas(cdb,e);
1620     cgstate.stackclean++;
1621     tym_t tys = tybasic(e.Ety);
1622     int sz = _tysize[tys];
1623     bool dword = (sz == 2 * REGSIZE);
1624     targ_ulong msw;
1625     bool mswsame = true;                // assume all msw's are the same
1626 
1627     targ_llong vmax = long.min;         // smallest possible llong
1628     targ_llong vmin = long.max;         // largest possible llong
1629     foreach (n, val; b.Bswitch)         // find max and min case values
1630     {
1631         if (val > vmax) vmax = val;
1632         if (val < vmin) vmin = val;
1633         if (REGSIZE == 2)
1634         {
1635             ushort ms = (val >> 16) & 0xFFFF;
1636             if (n == 0)
1637                 msw = ms;
1638             else if (msw != ms)
1639                 mswsame = false;
1640         }
1641         else // REGSIZE == 4
1642         {
1643             targ_ulong ms = (val >> 32) & 0xFFFFFFFF;
1644             if (n == 0)
1645                 msw = ms;
1646             else if (msw != ms)
1647                 mswsame = false;
1648         }
1649     }
1650     //dbg_printf("vmax = x%lx, vmin = x%lx, vmax-vmin = x%lx\n",vmax,vmin,vmax - vmin);
1651 
1652     /* Three kinds of switch strategies - pick one
1653      */
1654     const ncases = b.Bswitch.length;
1655     if (ncases <= 3)
1656         goto Lifthen;
1657     else if (I16 && cast(targ_ullong)(vmax - vmin) <= ncases * 2)
1658         goto Ljmptab;           // >=50% of the table is case values, rest is default
1659     else if (config.flags3 & CFG3ibt)
1660         goto Lifthen;           // no jump table for ENDBR
1661     else if (cast(targ_ullong)(vmax - vmin) <= ncases * 3)
1662         goto Ljmptab;           // >= 33% of the table is case values, rest is default
1663     else if (I16)
1664         goto Lswitch;
1665     else
1666         goto Lifthen;
1667 
1668     /*************************************************************************/
1669     {   // generate if-then sequence
1670     Lifthen:
1671         regm_t retregs = ALLREGS;
1672         b.BC = BCifthen;
1673         scodelem(cdb,e,&retregs,0,true);
1674         reg_t reg, reg2;
1675         if (dword)
1676         {   reg = findreglsw(retregs);
1677             reg2 = findregmsw(retregs);
1678         }
1679         else
1680         {
1681             reg = findreg(retregs);     // reg that result is in
1682             reg2 = NOREG;
1683         }
1684         list_t bl = b.Bsucc;
1685         block *bdefault = b.nthSucc(0);
1686         if (dword && mswsame)
1687         {
1688             cdb.genc2(0x81,modregrm(3,7,reg2),msw);   // CMP reg2,MSW
1689             genjmp(cdb,JNE,FLblock,bdefault);  // JNE default
1690             reg2 = NOREG;
1691         }
1692 
1693         reg_t sreg = NOREG;                          // may need a scratch register
1694 
1695         // Put into casevals[0..ncases] so we can sort then slice
1696 
1697         import dmd.common.string : SmallBuffer;
1698         CaseVal[10] tmp = void;
1699         auto sb = SmallBuffer!(CaseVal)(ncases, tmp[]);
1700         CaseVal[] casevals = sb[];
1701 
1702         foreach (n, val; b.Bswitch)
1703         {
1704             casevals[n].val = val;
1705             bl = list_next(bl);
1706             casevals[n].target = list_block(bl);
1707 
1708             // See if we need a scratch register
1709             if (sreg == NOREG && I64 && sz == 8 && val != cast(int)val)
1710             {   regm_t regm = ALLREGS & ~mask(reg);
1711                 allocreg(cdb,&regm, &sreg, TYint);
1712             }
1713         }
1714 
1715         // Sort cases so we can do a runtime binary search
1716         qsort(casevals.ptr, casevals.length, CaseVal.sizeof, &CaseVal.cmp);
1717 
1718         //for (uint n = 0; n < ncases; n++)
1719             //printf("casevals[%lld] = x%x\n", n, casevals[n].val);
1720 
1721         // Generate binary tree of comparisons
1722         ifthen(cdb, casevals, sz, reg, reg2, sreg, bdefault, bdefault != b.Bnext);
1723 
1724         cgstate.stackclean--;
1725         return;
1726     }
1727 
1728     /*************************************************************************/
1729     {
1730         // Use switch value to index into jump table
1731     Ljmptab:
1732         //printf("Ljmptab:\n");
1733 
1734         b.BC = BCjmptab;
1735 
1736         /* If vmin is small enough, we can just set it to 0 and the jump
1737          * table entries from 0..vmin-1 can be set with the default target.
1738          * This saves the SUB instruction.
1739          * Must be same computation as used in outjmptab().
1740          */
1741         if (vmin > 0 && vmin <= _tysize[TYint])
1742             vmin = 0;
1743 
1744         b.Btablesize = cast(int) (vmax - vmin + 1) * tysize(TYnptr);
1745         regm_t retregs = IDXREGS;
1746         if (dword)
1747             retregs |= mMSW;
1748         if (config.exe & EX_posix && I32 && config.flags3 & CFG3pic)
1749             retregs &= ~mBX;                            // need EBX for GOT
1750         bool modify = (I16 || I64 || vmin);
1751         scodelem(cdb,e,&retregs,0,!modify);
1752         reg_t reg = findreg(retregs & IDXREGS); // reg that result is in
1753         reg_t reg2;
1754         if (dword)
1755             reg2 = findregmsw(retregs);
1756         if (modify)
1757         {
1758             assert(!(retregs & regcon.mvar));
1759             getregs(cdb,retregs);
1760         }
1761         if (vmin)                       // if there is a minimum
1762         {
1763             cdb.genc2(0x81,modregrm(3,5,reg),cast(targ_size_t)vmin); // SUB reg,vmin
1764             if (dword)
1765             {   cdb.genc2(0x81,modregrm(3,3,reg2),cast(targ_size_t)MSREG(vmin)); // SBB reg2,vmin
1766                 genjmp(cdb,JNE,FLblock,b.nthSucc(0)); // JNE default
1767             }
1768         }
1769         else if (dword)
1770         {   gentstreg(cdb,reg2);              // TEST reg2,reg2
1771             genjmp(cdb,JNE,FLblock,b.nthSucc(0)); // JNE default
1772         }
1773         if (vmax - vmin != REGMASK)     // if there is a maximum
1774         {                               // CMP reg,vmax-vmin
1775             cdb.genc2(0x81,modregrm(3,7,reg),cast(targ_size_t)(vmax-vmin));
1776             if (I64 && sz == 8)
1777                 code_orrex(cdb.last(), REX_W);
1778             genjmp(cdb,JA,FLblock,b.nthSucc(0));  // JA default
1779         }
1780         if (I64)
1781         {
1782             if (!vmin)
1783             {   // Need to clear out high 32 bits of reg
1784                 // Use 8B instead of 89, as 89 will be optimized away as a NOP
1785                 genregs(cdb,0x8B,reg,reg);                 // MOV reg,reg
1786             }
1787             if (config.flags3 & CFG3pic || config.exe == EX_WIN64)
1788             {
1789                 /* LEA    R1,disp[RIP]          48 8D 05 00 00 00 00
1790                  * MOVSXD R2,[reg*4][R1]        48 63 14 B8
1791                  * LEA    R1,[R1][R2]           48 8D 04 02
1792                  * JMP    R1                    FF E0
1793                  */
1794                 reg_t r1;
1795                 regm_t scratchm = ALLREGS & ~mask(reg);
1796                 allocreg(cdb,&scratchm,&r1,TYint);
1797                 reg_t r2;
1798                 scratchm = ALLREGS & ~(mask(reg) | mask(r1));
1799                 allocreg(cdb,&scratchm,&r2,TYint);
1800 
1801                 CodeBuilder cdbe; cdbe.ctor();
1802                 cdbe.genc1(LEA,(REX_W << 16) | modregxrm(0,r1,5),FLswitch,0);        // LEA R1,disp[RIP]
1803                 cdbe.last().IEV1.Vswitch = b;
1804                 cdbe.gen2sib(0x63,(REX_W << 16) | modregxrm(0,r2,4), modregxrmx(2,reg,r1)); // MOVSXD R2,[reg*4][R1]
1805                 cdbe.gen2sib(LEA,(REX_W << 16) | modregxrm(0,r1,4),modregxrmx(0,r1,r2));    // LEA R1,[R1][R2]
1806                 cdbe.gen2(0xFF,modregrmx(3,4,r1));                                          // JMP R1
1807 
1808                 b.Btablesize = cast(int) (vmax - vmin + 1) * 4;
1809                 code *ce = cdbe.finish();
1810                 pinholeopt(ce, null);
1811 
1812                 cdb.append(cdbe);
1813             }
1814             else
1815             {
1816                 cdb.genc1(0xFF,modregrm(0,4,4),FLswitch,0);   // JMP disp[reg*8]
1817                 cdb.last().IEV1.Vswitch = b;
1818                 cdb.last().Isib = modregrm(3,reg & 7,5);
1819                 if (reg & 8)
1820                     cdb.last().Irex |= REX_X;
1821             }
1822         }
1823         else if (I32)
1824         {
1825 static if (JMPJMPTABLE)
1826 {
1827             /* LEA jreg,offset ctable[reg][reg * 4]
1828                JMP jreg
1829               ctable:
1830                JMP case0
1831                JMP case1
1832                ...
1833              */
1834             CodeBuilder ctable; ctable.ctor();
1835             block *bdef = b.nthSucc(0);
1836             targ_llong u;
1837             for (u = vmin; ; u++)
1838             {   block *targ = bdef;
1839                 foreach (n, val; b.Bswitch)
1840                 {
1841                     if (val == u)
1842                     {   targ = b.nthSucc(n + 1);
1843                         break;
1844                     }
1845                 }
1846                 genjmp(ctable,JMP,FLblock,targ);
1847                 ctable.last().Iflags |= CFjmp5;           // don't shrink these
1848                 if (u == vmax)
1849                     break;
1850             }
1851 
1852             // Allocate scratch register jreg
1853             regm_t scratchm = ALLREGS & ~mask(reg);
1854             uint jreg = AX;
1855             allocreg(cdb,&scratchm,&jreg,TYint);
1856 
1857             // LEA jreg, offset ctable[reg][reg*4]
1858             cdb.genc1(LEA,modregrm(2,jreg,4),FLcode,6);
1859             cdb.last().Isib = modregrm(2,reg,reg);
1860             cdb.gen2(0xFF,modregrm(3,4,jreg));      // JMP jreg
1861             cdb.append(ctable);
1862             b.Btablesize = 0;
1863             cgstate.stackclean--;
1864             return;
1865 }
1866 else
1867 {
1868         if (config.exe & (EX_OSX | EX_OSX64))
1869         {
1870             /*     CALL L1
1871              * L1: POP  R1
1872              *     ADD  R1,disp[reg*4][R1]
1873              *     JMP  R1
1874              */
1875             // Allocate scratch register r1
1876             regm_t scratchm = ALLREGS & ~mask(reg);
1877             reg_t r1;
1878             allocreg(cdb,&scratchm,&r1,TYint);
1879 
1880             cdb.genc2(CALL,0,0);                           //     CALL L1
1881             cdb.gen1(0x58 + r1);                           // L1: POP R1
1882             cdb.genc1(0x03,modregrm(2,r1,4),FLswitch,0);   // ADD R1,disp[reg*4][EBX]
1883             cdb.last().IEV1.Vswitch = b;
1884             cdb.last().Isib = modregrm(2,reg,r1);
1885             cdb.gen2(0xFF,modregrm(3,4,r1));               // JMP R1
1886         }
1887         else
1888         {
1889             if (config.flags3 & CFG3pic)
1890             {
1891                 /* MOV  R1,EBX
1892                  * SUB  R1,funcsym_p@GOTOFF[offset][reg*4][EBX]
1893                  * JMP  R1
1894                  */
1895 
1896                 // Load GOT in EBX
1897                 load_localgot(cdb);
1898 
1899                 // Allocate scratch register r1
1900                 regm_t scratchm = ALLREGS & ~(mask(reg) | mBX);
1901                 reg_t r1;
1902                 allocreg(cdb,&scratchm,&r1,TYint);
1903 
1904                 genmovreg(cdb,r1,BX);              // MOV R1,EBX
1905                 cdb.genc1(0x2B,modregxrm(2,r1,4),FLswitch,0);   // SUB R1,disp[reg*4][EBX]
1906                 cdb.last().IEV1.Vswitch = b;
1907                 cdb.last().Isib = modregrm(2,reg,BX);
1908                 cdb.gen2(0xFF,modregrmx(3,4,r1));               // JMP R1
1909             }
1910             else
1911             {
1912                 cdb.genc1(0xFF,modregrm(0,4,4),FLswitch,0);     // JMP disp[idxreg*4]
1913                 cdb.last().IEV1.Vswitch = b;
1914                 cdb.last().Isib = modregrm(2,reg,5);
1915             }
1916         }
1917 }
1918         }
1919         else if (I16)
1920         {
1921             cdb.gen2(0xD1,modregrm(3,4,reg));                   // SHL reg,1
1922             uint rm = getaddrmode(retregs) | modregrm(0,4,0);
1923             cdb.genc1(0xFF,rm,FLswitch,0);                  // JMP [CS:]disp[idxreg]
1924             cdb.last().IEV1.Vswitch = b;
1925             cdb.last().Iflags |= csseg ? CFcs : 0;                       // segment override
1926         }
1927         else
1928             assert(0);
1929         cgstate.stackclean--;
1930         return;
1931     }
1932 
1933     /*************************************************************************/
1934     {
1935         /* Scan a table of case values, and jump to corresponding address.
1936          * Since it relies on REPNE SCASW, it has really nothing to recommend it
1937          * over Lifthen for 32 and 64 bit code.
1938          * Note that it has not been tested with MACHOBJ (OSX).
1939          */
1940     Lswitch:
1941         regm_t retregs = mAX;                  // SCASW requires AX
1942         if (dword)
1943             retregs |= mDX;
1944         else if (ncases <= 6 || config.flags4 & CFG4speed)
1945             goto Lifthen;
1946         scodelem(cdb,e,&retregs,0,true);
1947         if (dword && mswsame)
1948         {   /* CMP DX,MSW       */
1949             cdb.genc2(0x81,modregrm(3,7,DX),msw);
1950             genjmp(cdb,JNE,FLblock,b.nthSucc(0)); // JNE default
1951         }
1952         getregs(cdb,mCX|mDI);
1953 
1954         if (config.flags3 & CFG3pic && config.exe & EX_posix)
1955         {   // Add in GOT
1956             getregs(cdb,mDX);
1957             cdb.genc2(CALL,0,0);        //     CALL L1
1958             cdb.gen1(0x58 + DI);        // L1: POP EDI
1959 
1960                                         //     ADD EDI,_GLOBAL_OFFSET_TABLE_+3
1961             Symbol *gotsym = Obj.getGOTsym();
1962             cdb.gencs(0x81,modregrm(3,0,DI),FLextern,gotsym);
1963             cdb.last().Iflags = CFoff;
1964             cdb.last().IEV2.Voffset = 3;
1965 
1966             makeitextern(gotsym);
1967 
1968             genmovreg(cdb, DX, DI);    // MOV EDX, EDI
1969                                         // ADD EDI,offset of switch table
1970             cdb.gencs(0x81,modregrm(3,0,DI),FLswitch,null);
1971             cdb.last().IEV2.Vswitch = b;
1972         }
1973 
1974         if (!(config.flags3 & CFG3pic))
1975         {
1976                                         // MOV DI,offset of switch table
1977             cdb.gencs(0xC7,modregrm(3,0,DI),FLswitch,null);
1978             cdb.last().IEV2.Vswitch = b;
1979         }
1980         movregconst(cdb,CX,ncases,0);    // MOV CX,ncases
1981 
1982         /* The switch table will be accessed through ES:DI.
1983          * Therefore, load ES with proper segment value.
1984          */
1985         if (config.flags3 & CFG3eseqds)
1986         {
1987             assert(!csseg);
1988             getregs(cdb,mCX);           // allocate CX
1989         }
1990         else
1991         {
1992             getregs(cdb,mES|mCX);       // allocate ES and CX
1993             cdb.gen1(csseg ? 0x0E : 0x1E);      // PUSH CS/DS
1994             cdb.gen1(0x07);                     // POP  ES
1995         }
1996 
1997         targ_size_t disp = (ncases - 1) * _tysize[TYint];  // displacement to jump table
1998         if (dword && !mswsame)
1999         {
2000 
2001             /* Build the following:
2002                 L1:     SCASW
2003                         JNE     L2
2004                         CMP     DX,[CS:]disp[DI]
2005                 L2:     LOOPNE  L1
2006              */
2007 
2008             const int mod = (disp > 127) ? 2 : 1;         // displacement size
2009             code *cloop = genc2(null,0xE0,0,-7 - mod - csseg);   // LOOPNE scasw
2010             cdb.gen1(0xAF);                                      // SCASW
2011             code_orflag(cdb.last(),CFtarg2);                     // target of jump
2012             genjmp(cdb,JNE,FLcode,cast(block *) cloop); // JNE loop
2013                                                                  // CMP DX,[CS:]disp[DI]
2014             cdb.genc1(0x39,modregrm(mod,DX,5),FLconst,disp);
2015             cdb.last().Iflags |= csseg ? CFcs : 0;              // possible seg override
2016             cdb.append(cloop);
2017             disp += ncases * _tysize[TYint];           // skip over msw table
2018         }
2019         else
2020         {
2021             cdb.gen1(0xF2);              // REPNE
2022             cdb.gen1(0xAF);              // SCASW
2023         }
2024         genjmp(cdb,JNE,FLblock,b.nthSucc(0)); // JNE default
2025         const int mod = (disp > 127) ? 2 : 1;     // 1 or 2 byte displacement
2026         if (csseg)
2027             cdb.gen1(SEGCS);            // table is in code segment
2028 
2029         if (config.flags3 & CFG3pic &&
2030             config.exe & EX_posix)
2031         {                               // ADD EDX,(ncases-1)*2[EDI]
2032             cdb.genc1(0x03,modregrm(mod,DX,7),FLconst,disp);
2033                                         // JMP EDX
2034             cdb.gen2(0xFF,modregrm(3,4,DX));
2035         }
2036 
2037         if (!(config.flags3 & CFG3pic))
2038         {                               // JMP (ncases-1)*2[DI]
2039             cdb.genc1(0xFF,modregrm(mod,4,(I32 ? 7 : 5)),FLconst,disp);
2040             cdb.last().Iflags |= csseg ? CFcs : 0;
2041         }
2042         b.Btablesize = disp + _tysize[TYint] + ncases * tysize(TYnptr);
2043         //assert(b.Bcode);
2044         cgstate.stackclean--;
2045         return;
2046     }
2047 }
2048 
/******************************
 * Emit the data for the jump table of a BCjmptab block.
 * One entry is written for every value in [vmin .. vmax]; values
 * that have no case get the address of the default successor.
 * Params:
 *      b = the switch block; b.Bswitch holds the case values, successor 0
 *          is the default and successor n+1 belongs to case n
 */

@trusted
void outjmptab(block *b)
{
    if (JMPJMPTABLE && I32)
        return;

    const ncases = b.Bswitch.length;

    /* Range of the table. This, including the "small vmin becomes 0"
     * adjustment, must be the same computation as used in doswitch().
     */
    targ_llong vmin = long.max;
    targ_llong vmax = long.min;
    foreach (caseval; b.Bswitch)
    {
        if (caseval < vmin) vmin = caseval;
        if (caseval > vmax) vmax = caseval;
    }
    if (vmin > 0 && vmin <= _tysize[TYint])
        vmin = 0;
    assert(vmin <= vmax);

    // Where the table goes
    int jmpseg = objmod.jmpTableSegment(funcsym_p);
    targ_size_t *poffset = &Offset(jmpseg);

    // Pad up to the table's alignment; we must land on the precomputed offset
    targ_size_t alignbytes = _align(0,*poffset) - *poffset;
    objmod.lidata(jmpseg,*poffset,alignbytes);
    assert(*poffset == b.Btableoffset);

    // The target configuration does not change while the table is written
    const bool pic   = (config.flags3 & CFG3pic) != 0;
    const bool elf64 = (config.exe & (EX_LINUX64 | EX_FREEBSD64 | EX_OPENBSD64 | EX_DRAGONFLYBSD64 | EX_SOLARIS64)) != 0;
    const bool elf32 = (config.exe & (EX_LINUX | EX_FREEBSD | EX_OPENBSD | EX_SOLARIS)) != 0;
    const bool relative32 = config.exe & (EX_OSX | EX_OSX64) || I64;

    Symbol *gotsym = null;                              // fetched on first use
    const targ_size_t deftarg = b.nthSucc(0).Boffset;   // address of default

    targ_llong u = vmin;
    while (1)
    {
        // Find the case whose value is u, else fall back to the default
        targ_size_t targ = deftarg;
        for (size_t n = 0; n < ncases; ++n)
        {
            if (b.Bswitch[n] != u)
                continue;
            targ = b.nthSucc(cast(int)(n + 1)).Boffset;
            break;
        }

        if (elf64)
        {
            if (pic)
            {
                // 4 byte entries
                objmod.reftodatseg(jmpseg,*poffset,cast(targ_size_t)(targ + (u - vmin) * 4),funcsym_p.Sseg,CFswitch);
                *poffset += 4;
            }
            else
            {
                // 8 byte entries
                objmod.reftodatseg(jmpseg,*poffset,targ,funcsym_p.Sxtrnnum,CFoffset64 | CFswitch);
                *poffset += 8;
            }
        }
        else if (elf32)
        {
            if (pic)
            {
                assert(config.flags & CFGromable);
                // Want a GOTPC fixup to _GLOBAL_OFFSET_TABLE_
                if (!gotsym)
                    gotsym = Obj.getGOTsym();
                objmod.reftoident(jmpseg,*poffset,gotsym,*poffset - targ,CFswitch);
            }
            else
                objmod.reftocodeseg(jmpseg,*poffset,targ);
            *poffset += 4;
        }
        else if (relative32)
        {
            // 32 bit difference from the table offset (64 bit) or table base (OSX 32 bit)
            const rel = cast(uint)(targ - (I64 ? b.Btableoffset : b.Btablebase));
            objmod.write_bytes(SegData[jmpseg],(&rel)[0 .. 1]);
        }
        else
        {
            objmod.reftocodeseg(jmpseg,*poffset,targ);
            *poffset += tysize(TYnptr);
        }

        // Test before incrementing so that vmax == ~0 terminates too
        if (u == vmax)
            break;
        ++u;
    }
}
2142 
2143 
/******************************
 * Emit the data for a switch table.
 * Layout: the table of case values, optionally a table of the most
 * significant words of the case values, then the table of case addresses.
 * Params:
 *      b = the switch block; b.Bswitch holds the case values and the
 *          successors after the first are the case targets, in order
 */

@trusted
void outswitab(block *b)
{
    //printf("outswitab()\n");
    const ncases = b.Bswitch.length;

    const int seg = objmod.jmpTableSegment(funcsym_p);
    targ_size_t *poffset = &Offset(seg);

    // `expected` shadows the segment offset so every stage can be cross-checked
    targ_size_t expected = *poffset;
    const targ_size_t pad = _align(0,*poffset) - *poffset;
    objmod.lidata(seg,*poffset,pad);            // alignment padding, if any
    expected += pad;
    assert(*poffset == expected);

    // Case value table, _tysize[TYint] bytes per entry
    const uint valsize = _tysize[TYint];
    assert(SegData[seg].SDseg == seg);
    foreach (caseval; b.Bswitch)
        objmod.write_bytes(SegData[seg],(cast(void*)&caseval)[0 .. valsize]);
    expected += valsize * ncases;
    assert(*poffset == expected);

    const ptrsize = tysize(TYnptr);

    // A table of this exact size signals that an MSW table is wanted as well
    if (b.Btablesize == ncases * (REGSIZE * 2 + ptrsize))
    {
        foreach (caseval; b.Bswitch)
        {
            auto msval = cast(targ_size_t)MSREG(caseval);
            objmod.write_bytes(SegData[seg],(cast(void*)&msval)[0 .. REGSIZE]);
        }
        expected += REGSIZE * ncases;
        assert(*poffset == expected);
    }

    // Address table: step past the first successor, then one entry per case
    list_t succ = b.Bsucc;
    for (size_t n = 0; n < ncases; ++n)
    {
        succ = list_next(succ);
        objmod.reftocodeseg(seg,*poffset,list_block(succ).Boffset);
        *poffset += ptrsize;
    }
    assert(*poffset == expected + ncases * ptrsize);
}
2194 
/*****************************
 * Return a jump opcode relevant to the elem for a JMP true.
 *
 * OPcomma elems, and OPeq elems whose left operand is a leaf, are first
 * skipped through to their right operand, as that is what sets the flags.
 * Params:
 *      e = expression that the branch tests, must not be null
 * Returns:
 *      a conditional jump opcode. For floating point operands the value
 *      may additionally have JP or JNP or'ed in shifted left by 8 bits
 *      (see XP and XNP below). Values looked up in the tables are
 *      asserted to be of the form 0x7?.
 */

@trusted
int jmpopcode(elem *e)
{
    //printf("jmpopcode()\n"); elem_print(e);
    tym_t tym;
    int zero,i,jp,op;
    // Integer compares, indexed by [i][zero][op - OPle] where i is
    // 0 for signed and 1 for unsigned
    static immutable ubyte[6][2][2] jops =
    [   /* <=  >   <   >=  ==  !=    <=0 >0  <0  >=0 ==0 !=0    */
       [ [JLE,JG ,JL ,JGE,JE ,JNE],[JLE,JG ,JS ,JNS,JE ,JNE] ], /* signed   */
       [ [JBE,JA ,JB ,JAE,JE ,JNE],[JE ,JNE,JB ,JAE,JE ,JNE] ], /* uint */
/+
       [ [JLE,JG ,JL ,JGE,JE ,JNE],[JLE,JG ,JL ,JGE,JE ,JNE] ], /* real     */
       [ [JBE,JA ,JB ,JAE,JE ,JNE],[JBE,JA ,JB ,JAE,JE ,JNE] ], /* 8087     */
       [ [JA ,JBE,JAE,JB ,JE ,JNE],[JBE,JA ,JB ,JAE,JE ,JNE] ], /* 8087 R   */
+/
    ];

    // Parity jumps, placed in the second byte of the returned value
    enum
    {
        XP     = (JP  << 8),
        XNP    = (JNP << 8),
    }
    // Floating point compares, indexed by [0][op - OPle]
    static immutable uint[26][1] jfops =
    /*   le     gt lt     ge  eqeq    ne     unord lg  leg  ule ul uge  */
    [
      [ XNP|JBE,JA,XNP|JB,JAE,XNP|JE, XP|JNE,JP,   JNE,JNP, JBE,JC,XP|JAE,

    /*  ug    ue ngt nge nlt    nle    ord nlg nleg nule nul nuge    nug     nue */
        XP|JA,JE,JBE,JB, XP|JAE,XP|JA, JNP,JE, JP,  JA,  JNC,XNP|JB, XNP|JBE,JNE        ], /* 8087     */
    ];

    assert(e);
    while (e.Eoper == OPcomma ||
        /* The OTleaf(e.EV.E1.Eoper) is to line up with the case in cdeq() where  */
        /* we decide if mPSW is passed on when evaluating E2 or not.    */
         (e.Eoper == OPeq && OTleaf(e.EV.E1.Eoper)))
    {
        e = e.EV.E2;                      /* right operand determines it  */
    }

    op = e.Eoper;
    tym_t tymx = tybasic(e.Ety);
    // Does a floating point result need the parity flag tested too?
    // NOTE(review): the second argument to regmask() is tybasic() of an
    // operator (Eoper), not of a type - confirm this is intended
    bool needsNanCheck = tyfloating(tymx) && config.inline8087 &&
        (tymx == TYldouble || tymx == TYildouble || tymx == TYcldouble ||
         tymx == TYcdouble || tymx == TYcfloat ||
         (tyxmmreg(tymx) && config.fpxmmregs && e.Ecount != e.Ecomsub) ||
         op == OPind ||
         (OTcall(op) && (regmask(tymx, tybasic(e.EV.E1.Eoper)) & (mST0 | XMMREGS))));

    if (!needsNanCheck)
    {
        /* If e is in an XMM register, need to use XP.
         * Match same test in loaddata()
         */
        Symbol* s;
        needsNanCheck = e.Eoper == OPvar &&
            (s = e.EV.Vsym).Sfl == FLreg &&
             s.Sregm & XMMREGS &&
             (tymx == TYfloat || tymx == TYifloat || tymx == TYdouble || tymx ==TYidouble);
    }

    if (e.Ecount != e.Ecomsub)          // comsubs just get Z bit set
    {
        if (needsNanCheck) // except for floating point values that need a NaN check
            return XP|JNE;
        else
            return JNE;
    }
    if (!OTrel(op))                       // not relational operator
    {
        if (needsNanCheck)
            return XP|JNE;

        // Look through up to three nested unsigned widenings to the operator
        // underneath; the bit test operators yield JC, everything else JNE
        if (op == OPu32_64) { e = e.EV.E1; op = e.Eoper; }
        if (op == OPu16_32) { e = e.EV.E1; op = e.Eoper; }
        if (op == OPu8_16) op = e.EV.E1.Eoper;
        return ((op >= OPbt && op <= OPbts) || op == OPbtst) ? JC : JNE;
    }

    // zero is set when the right operand is a constant for which boolres() is false
    if (e.EV.E2.Eoper == OPconst)
        zero = !boolres(e.EV.E2);
    else
        zero = 0;

    tym = e.EV.E1.Ety;
    if (tyfloating(tym))
    {
static if (1)
{
        i = 0;
        if (config.inline8087)
        {   i = 1;

            // Decide whether the relation must be swapped before the table lookup
static if (1)
{
            if (rel_exception(op) || config.flags4 & CFG4fastfloat)
            {
                const bool NOSAHF = (I64 || config.fpxmmregs);
                if (zero)
                {
                    if (NOSAHF)
                        op = swaprel(op);
                }
                else if (NOSAHF)
                    op = swaprel(op);
                else if (cmporder87(e.EV.E2))
                    op = swaprel(op);
                else
                { }
            }
            else
            {
                if (zero && config.target_cpu < TARGET_80386)
                { }
                else
                    op = swaprel(op);
            }
}
else
{
            if (zero && !rel_exception(op) && config.target_cpu >= TARGET_80386)
                op = swaprel(op);
            else if (!zero &&
                (cmporder87(e.EV.E2) || !(rel_exception(op) || config.flags4 & CFG4fastfloat)))
                /* compare is reversed */
                op = swaprel(op);
}
        }
        jp = jfops[0][op - OPle];
        goto L1;                        // skip the integer table lookup
}
else
{
        i = (config.inline8087) ? (3 + cmporder87(e.EV.E2)) : 2;
}
    }
    else if (tyuns(tym) || tyuns(e.EV.E2.Ety))
        i = 1;
    else if (tyintegral(tym) || typtr(tym))
        i = 0;
    else
    {
        // `debug` applies to the elem_print() call only; the printf and
        // the assert are always compiled in
        debug
        elem_print(e);
        printf("%s\n", tym_str(tym));
        assert(0);
    }

    jp = jops[i][zero][op - OPle];        /* table starts with OPle       */

    /* Try to rewrite uint comparisons so they rely on just the Carry flag
     */
    if (i == 1 && (jp == JA || jp == JBE) &&
        (e.EV.E2.Eoper != OPconst && e.EV.E2.Eoper != OPrelconst))
    {
        jp = (jp == JA) ? JC : JNC;
    }

L1:
    // In debug builds, print the details before the assert below fires
    debug
    if ((jp & 0xF0) != 0x70)
    {
        printf("%s i %d zero %d op x%x jp x%x\n",oper_str(op),i,zero,op,jp);
    }

    assert((jp & 0xF0) == 0x70);
    return jp;
}
2367 
/**********************************
 * Append code to cdb which validates pointer described by
 * addressing mode in *pcs. Modify addressing mode in *pcs.
 *
 * The generated code pushes the address (preceded by the segment for
 * 16 bit code) and calls the runtime library's RTLSYM.PTRCHK function.
 * Nothing is generated for 32 bit code when *pcs has a segment override.
 * Must not be called when generating 64 bit code.
 * Params:
 *    cdb = append generated code to this
 *    pcs = original addressing mode to be updated; on return it is
 *          0[reg], where reg is the register holding the address
 *    keepmsk = mask of registers we must not destroy or use
 *              if (keepmsk & RMstore), this will be only a store operation
 *              into the lvalue
 */

@trusted
void cod3_ptrchk(ref CodeBuilder cdb,code *pcs,regm_t keepmsk)
{
    reg_t reg;

    assert(!I64);
    if (!I16 && pcs.Iflags & (CFes | CFss | CFcs | CFds | CFfs | CFgs))
        return;         // not designed to deal with 48 bit far pointers

    ubyte rm = pcs.Irm;
    assert(!(rm & 0x40));       // no disp8 or reg addressing modes

    // If the addressing mode is already a register
    reg = rm & 7;
    if (I16)
    {   static immutable ubyte[8] imode = [ BP,BP,BP,BP,SI,DI,BP,BX ];

        reg = imode[reg];               // convert [SI] to SI, etc.
    }
    regm_t idxregs = mask(reg);
    if ((rm & 0x80 && (pcs.IFL1 != FLoffset || pcs.IEV1.Vuns)) ||
        !(idxregs & ALLREGS)
       )
    {
        // Load the offset into a register, so we can push the address
        regm_t idxregs2 = (I16 ? IDXREGS : ALLREGS) & ~keepmsk; // only these can be index regs
        assert(idxregs2);
        allocreg(cdb,&idxregs2,&reg,TYoffset);

        /* The address now lives in the newly allocated reg. idxregs must
         * follow it, or the addressing mode rewritten below would still go
         * through the original register instead of the one that is pushed.
         */
        idxregs = mask(reg);

        // Temporarily turn *pcs into an LEA to compute the address
        const opsave = pcs.Iop;
        const flagsave = pcs.Iflags;
        pcs.Iop = LEA;
        pcs.Irm |= modregrm(0,reg,0);
        pcs.Iflags &= ~(CFopsize | CFss | CFes | CFcs);        // no prefix bytes needed
        cdb.gen(pcs);                 // LEA reg,EA

        pcs.Iflags = flagsave;
        pcs.Iop = opsave;
    }

    // registers destroyed by the function call
    //used = (mBP | ALLREGS | mES) & ~fregsaved;
    regm_t used = 0;                           // much less code generated this way

    // Save the registers in tosave around the call; the POPs are collected
    // in reverse order in cs2 and appended after the call
    code *cs2 = null;
    regm_t tosave = used & (keepmsk | idxregs);
    for (int i = 0; tosave; i++)
    {
        regm_t mi = mask(i);

        assert(i < REGMAX);
        if (mi & tosave)        /* i = register to save                 */
        {
            int push,pop;

            stackchanged = 1;
            if (i == ES)
            {   push = 0x06;
                pop = 0x07;
            }
            else
            {   push = 0x50 + i;
                pop = push | 8;
            }
            cdb.gen1(push);                     // PUSH i
            cs2 = cat(gen1(null,pop),cs2);      // POP i
            tosave &= ~mi;
        }
    }

    // For 16 bit models, push a far pointer
    if (I16)
    {
        int segreg;             // opcode of the PUSH segreg instruction

        switch (pcs.Iflags & (CFes | CFss | CFcs | CFds | CFfs | CFgs))
        {   case CFes:  segreg = 0x06;  break;
            case CFss:  segreg = 0x16;  break;
            case CFcs:  segreg = 0x0E;  break;
            case 0:     segreg = 0x1E;  break;  // DS
            default:
                assert(0);
        }

        // See if we should default to SS:
        // (Happens when BP is part of the addressing mode)
        if (segreg == 0x1E && (rm & 0xC0) != 0xC0 &&
            rm & 2 && (rm & 7) != 7)
        {
            segreg = 0x16;
            if (config.wflags & WFssneds)
                pcs.Iflags |= CFss;    // because BP won't be there anymore
        }
        cdb.gen1(segreg);               // PUSH segreg
    }

    cdb.gen1(0x50 + reg);               // PUSH reg

    // Rewrite the addressing mode in *pcs so it is just 0[reg]
    setaddrmode(pcs, idxregs);
    pcs.IFL1 = FLoffset;
    pcs.IEV1.Vuns = 0;

    // Call the validation function
    {
        makeitextern(getRtlsym(RTLSYM.PTRCHK));

        used &= ~(keepmsk | idxregs);           // regs destroyed by this exercise
        getregs(cdb,used);
                                                // CALL __ptrchk
        cdb.gencs((LARGECODE) ? 0x9A : CALL,0,FLfunc,getRtlsym(RTLSYM.PTRCHK));
    }

    cdb.append(cs2);
}
2496 
/***********************************
 * Decide whether BP is free to serve as a general purpose register in
 * the function being compiled, i.e. whether no frame pointer is needed.
 * Note parallels between this routine and prolog().
 * As a side effect, the stack offsets are estimated and localsize is
 * set to that estimate.
 * Returns:
 *      0       can't be used, needed for frame
 *      mBP     can be used
 */

@trusted
regm_t cod3_useBP()
{
    // Note that DOSX memory model cannot use EBP as a general purpose
    // register, as SS != DS.
    if (!(config.exe & EX_flat) || config.flags & (CFGalwaysframe | CFGnoebp))
        return 0;

    if (anyiasm)
        return 0;

    const tym_t tyf = funcsym_p.ty();
    if (tyf & mTYnaked)                 // if no prolog/epilog for function
        return 0;

    if (funcsym_p.Sfunc.Fflags3 & Ffakeeh)
        return 0;                       // need consistent stack frame

    const tym_t tym = tybasic(tyf);
    if (tym == TYifunc)
        return 0;

    stackoffsets(globsym, true);                // estimate stack offsets
    localsize = Auto.offset + Fast.offset;      // an estimate only

    // Any one of these means the frame pointer is wanted
    const bool needFrame =
        !(config.flags4 & CFG4speed) ||
        config.target_cpu < TARGET_Pentium ||
        tyfarfunc(tym) ||
        config.flags & CFGstack ||
        localsize >= 0x100 ||       // arbitrary value < 0x1000
        (usednteh & (NTEH_try | NTEH_except | NTEHcpp | EHcleanup | EHtry | NTEHpassthru)) ||
        calledFinally ||
        Alloca.size;
    if (needFrame)
        return 0;

    return mBP;
}
2552 
/*************************************************
 * Try to build, in *c, a single instruction that recomputes the common
 * subexpression e, to be used later to restore the cse.
 * Two forms are recognized, both requiring that the variable is in a
 * register and is not marked SFLspill:
 *      regvar + constant       becomes LEA  (not for 16 bit code)
 *      *regvar                 becomes MOV
 * Params:
 *      c = instruction to fill in; only written to on success
 *      e = the common subexpression
 * Returns:
 *      true if *c was filled in
 */

@trusted
bool cse_simple(code *c, elem *e)
{
    regm_t regm;
    reg_t reg;
    const int sz = tysize(e.Ety);

    if (e.Eoper == OPadd)
    {
        if (I16 || !(sz == REGSIZE))            // don't bother with 16 bit code
            return false;
        if (e.EV.E2.Eoper != OPconst)
            return false;
        elem *e1 = e.EV.E1;
        if (e1.Eoper != OPvar || !isregvar(e1,regm,reg))
            return false;
        if (e1.EV.Vsym.Sflags & SFLspill)
            return false;

        // LEA with the constant as the displacement
        memset(c,0,code.sizeof);
        c.Iop = LEA;
        buildEA(c,reg,-1,1,e.EV.E2.EV.Vuns);
        if (I64 && sz == 8)
            c.Irex |= REX_W;
        return true;
    }

    if (e.Eoper == OPind)
    {
        if (!(sz <= REGSIZE))
            return false;
        elem *e1 = e.EV.E1;
        if (e1.Eoper != OPvar || !isregvar(e1,regm,reg))
            return false;
        if (!(I32 || I64 || regm & IDXREGS))
            return false;
        if (e1.EV.Vsym.Sflags & SFLspill)
            return false;

        // MOV reg,EA with the byte or the full size opcode
        memset(c,0,code.sizeof);
        if (sz == 1)
            c.Iop = 0x8A;
        else
            c.Iop = 0x8B;
        buildEA(c,reg,-1,1,0);
        if (sz == 2 && I32)
            c.Iflags |= CFopsize;
        else if (I64 && sz == 8)
            c.Irex |= REX_W;
        return true;
    }

    return false;
}
2609 
/**************************
 * Generate a store of `reg` into entry `slot` of the common
 * subexpression save area, i.e. MOV slot[BP],reg.
 * Params:
 *      cdb = where to write code to
 *      tym = type of value that's in `reg`
 *      reg = register to save; may be an XMM register or ES
 *      slot = index into common subexpression save area
 */
@trusted
void gen_storecse(ref CodeBuilder cdb, tym_t tym, reg_t reg, size_t slot)
{
    if (isXMMreg(reg) && config.fpxmmregs) // watch out for ES
    {
        // Vectors only count as aligned if the stack is aligned to 16 or more
        const bool aligned = !tyvector(tym) || STACKALIGN >= 16;
        cdb.genc1(xmmstore(tym, aligned),modregxrm(2, reg - XMM0, BPRM),FLcs,cast(targ_size_t)slot);
        return;
    }

    opcode_t op;
    if (reg == ES)
    {
        op = 0x8C;          // segment reg mov
        reg = 0;            // the real reg number
    }
    else
        op = STO;           // normal mov

    cdb.genc1(op,modregxrm(2, reg, BPRM),FLcs,cast(targ_uns)slot);
    if (I64)
        code_orrex(cdb.last(), REX_W);
}
2639 
/**************************
 * Emit a compare of entry `slot` of the common subexpression save area
 * against 0.
 * Params:
 *      cdb = code builder that receives the generated instruction
 *      tym = type of the saved value (not used by this function)
 *      sz = size in bytes of the saved value
 *      slot = index of the entry in the common subexpression save area
 */
@trusted
void gen_testcse(ref CodeBuilder cdb, tym_t tym, uint sz, size_t slot)
{
    // CMP slot[BP],0
    const opcode_t cmpop = (sz == 1) ? 0x80 : 0x81;
    cdb.genc(cmpop, modregrm(2,7,BPRM),
             FLcs, cast(targ_uns)slot, FLconst, cast(targ_uns) 0);

    code* cmp = cdb.last();
    if (sz == 2 && (I64 || I32))
        cmp.Iflags |= CFopsize;         // 16 bit operand in 32/64 bit code
    if (sz == 8 && I64)
        code_orrex(cmp, REX_W);         // 64 bit operand
}
2651 
/**************************
 * Emit a load of entry `slot` of the common subexpression save area
 * into `reg`.
 * Params:
 *      cdb = code builder that receives the generated instruction
 *      tym = type of the value being loaded
 *      reg = destination register
 *      slot = index of the entry in the common subexpression save area
 */
@trusted
void gen_loadcse(ref CodeBuilder cdb, tym_t tym, reg_t reg, size_t slot)
{
    // MOV reg,slot[BP]
    const bool viaXmm = isXMMreg(reg) && config.fpxmmregs;
    if (viaXmm)
    {
        // Vectors get the aligned form only when STACKALIGN is at least 16
        const bool aligned = tyvector(tym) ? STACKALIGN >= 16 : true;
        cdb.genc1(xmmload(tym, aligned), modregxrm(2, reg - XMM0, BPRM), FLcs, cast(targ_size_t)slot);
        return;
    }

    opcode_t movop;
    if (reg == ES)
    {
        movop = 0x8E;       // segment register load
        reg = 0;            // register field used in the encoding
    }
    else
        movop = LOD;

    cdb.genc1(movop, modregxrm(2,reg,BPRM), FLcs, cast(targ_uns)slot);
    if (I64)
        code_orrex(cdb.last(), REX_W);
}
2673 
2674 /***************************************
2675  * Gen code for OPframeptr
2676  */
2677 
2678 @trusted
2679 void cdframeptr(ref CodeBuilder cdb, elem *e, regm_t *pretregs)
2680 {
2681     regm_t retregs = *pretregs & allregs;
2682     if  (!retregs)
2683         retregs = allregs;
2684     reg_t reg;
2685     allocreg(cdb,&retregs, &reg, TYint);
2686 
2687     code cs;
2688     cs.Iop = ESCAPE | ESCframeptr;
2689     cs.Iflags = 0;
2690     cs.Irex = 0;
2691     cs.Irm = cast(ubyte)reg;
2692     cdb.gen(&cs);
2693     fixresult(cdb,e,retregs,pretregs);
2694 }
2695 
2696 /***************************************
2697  * Gen code for load of _GLOBAL_OFFSET_TABLE_.
2698  * This value gets cached in the local variable 'localgot'.
2699  */
2700 
2701 @trusted
2702 void cdgot(ref CodeBuilder cdb, elem *e, regm_t *pretregs)
2703 {
2704     if (config.exe & (EX_OSX | EX_OSX64))
2705     {
2706         regm_t retregs = *pretregs & allregs;
2707         if  (!retregs)
2708             retregs = allregs;
2709         reg_t reg;
2710         allocreg(cdb,&retregs, &reg, TYnptr);
2711 
2712         cdb.genc(CALL,0,0,0,FLgot,0);     //     CALL L1
2713         cdb.gen1(0x58 + reg);             // L1: POP reg
2714 
2715         fixresult(cdb,e,retregs,pretregs);
2716     }
2717     else if (config.exe & EX_posix)
2718     {
2719         regm_t retregs = *pretregs & allregs;
2720         if  (!retregs)
2721             retregs = allregs;
2722         reg_t reg;
2723         allocreg(cdb,&retregs, &reg, TYnptr);
2724 
2725         cdb.genc2(CALL,0,0);        //     CALL L1
2726         cdb.gen1(0x58 + reg);       // L1: POP reg
2727 
2728                                     //     ADD reg,_GLOBAL_OFFSET_TABLE_+3
2729         Symbol *gotsym = Obj.getGOTsym();
2730         cdb.gencs(0x81,modregrm(3,0,reg),FLextern,gotsym);
2731         /* Because the 2:3 offset from L1: is hardcoded,
2732          * this sequence of instructions must not
2733          * have any instructions in between,
2734          * so set CFvolatile to prevent the scheduler from rearranging it.
2735          */
2736         code *cgot = cdb.last();
2737         cgot.Iflags = CFoff | CFvolatile;
2738         cgot.IEV2.Voffset = (reg == AX) ? 2 : 3;
2739 
2740         makeitextern(gotsym);
2741         fixresult(cdb,e,retregs,pretregs);
2742     }
2743     else
2744         assert(0);
2745 }
2746 
2747 /**************************************************
2748  * Load contents of localgot into EBX.
2749  */
2750 
2751 @trusted
2752 void load_localgot(ref CodeBuilder cdb)
2753 {
2754     if (config.exe & (EX_LINUX | EX_FREEBSD | EX_OPENBSD | EX_SOLARIS)) // note: I32 only
2755     {
2756         if (config.flags3 & CFG3pic)
2757         {
2758             if (localgot && !(localgot.Sflags & SFLdead))
2759             {
2760                 localgot.Sflags &= ~GTregcand;     // because this hack doesn't work with reg allocator
2761                 elem *e = el_var(localgot);
2762                 regm_t retregs = mBX;
2763                 codelem(cdb,e,&retregs,false);
2764                 el_free(e);
2765             }
2766             else
2767             {
2768                 elem *e = el_long(TYnptr, 0);
2769                 e.Eoper = OPgot;
2770                 regm_t retregs = mBX;
2771                 codelem(cdb,e,&retregs,false);
2772                 el_free(e);
2773             }
2774         }
2775     }
2776 }
2777 
2778 /*****************************
2779  * Returns:
2780  *      # of bytes stored
2781  */
2782 
2783 
2784 @trusted
2785 int obj_namestring(char *p,const(char)* name)
2786 {
2787     size_t len = strlen(name);
2788     if (len > 255)
2789     {
2790         short *ps = cast(short *)p;
2791         p[0] = 0xFF;
2792         p[1] = 0;
2793         ps[1] = cast(short)len;
2794         memcpy(p + 4,name,len);
2795         const int ONS_OHD = 4;           // max # of extra bytes added by obj_namestring()
2796         len += ONS_OHD;
2797     }
2798     else
2799     {
2800         p[0] = cast(char)len;
2801         memcpy(p + 1,name,len);
2802         len++;
2803     }
2804     return cast(int)len;
2805 }
2806 
/**************************
 * Generate a two-operand register,register instruction (mod field 3).
 * Params:
 *      cdb = code builder that receives the generated instruction
 *      op = opcode
 *      dstreg = register placed in the reg field
 *      srcreg = register placed in the r/m field
 */
void genregs(ref CodeBuilder cdb,opcode_t op,uint dstreg,uint srcreg)
{
    const modrm = modregxrmx(3, dstreg, srcreg);
    cdb.gen2(op, modrm);
}
2811 
/**************************
 * Generate TEST t,t and mark the instruction with CFpsw.
 * Params:
 *      cdb = code builder that receives the generated instruction
 *      t = register to test against itself
 */
void gentstreg(ref CodeBuilder cdb, uint t)
{
    const modrm = modregxrmx(3, t, t);
    cdb.gen2(0x85, modrm);              // TEST t,t
    code_orflag(cdb.last(), CFpsw);
}
2817 
/**************************
 * Generate PUSH reg.
 * Params:
 *      cdb = code builder that receives the generated instruction
 *      reg = register to push; bit 3 selects the REX_B extended registers
 */
void genpush(ref CodeBuilder cdb, reg_t reg)
{
    const low3 = reg & 7;               // register number encoded in the opcode
    cdb.gen1(0x50 + low3);

    const bool extended = (reg & 8) != 0;
    if (extended)
        code_orrex(cdb.last(), REX_B);
}
2824 
/**************************
 * Generate POP reg.
 * Params:
 *      cdb = code builder that receives the generated instruction
 *      reg = register to pop into; bit 3 selects the REX_B extended registers
 */
void genpop(ref CodeBuilder cdb, reg_t reg)
{
    const low3 = reg & 7;               // register number encoded in the opcode
    cdb.gen1(0x58 + low3);

    const bool extended = (reg & 8) != 0;
    if (extended)
        code_orrex(cdb.last(), REX_B);
}
2831 
2832 /**************************
2833  * Generate a MOV to,from register instruction.
2834  * Smart enough to dump redundant register moves, and segment
2835  * register moves.
2836  */
2837 
2838 code *genmovreg(uint to,uint from)
2839 {
2840     CodeBuilder cdb; cdb.ctor();
2841     genmovreg(cdb, to, from);
2842     return cdb.finish();
2843 }
2844 
/**************************
 * Generate a MOV to,from register instruction without specifying a type.
 * Forwards to the typed overload with TYMAX as the type.
 * Params:
 *      cdb = code builder that receives the generated instruction
 *      to = destination register
 *      from = source register
 */
void genmovreg(ref CodeBuilder cdb,uint to,uint from)
{
    const tym_t unspecified = TYMAX;
    genmovreg(cdb, to, from, unspecified);
}
2849 
/**************************
 * Generate a MOV to,from register instruction for a value of type `tym`.
 * Nothing is generated when `to` and `from` are the same register.
 * Handles general purpose, XMM and ES registers; any other combination
 * asserts.
 * Params:
 *      cdb = code builder that receives the generated instruction
 *      to = destination register
 *      from = source register
 *      tym = type of the value moved; TYMAX is treated as TYsize_t
 */
@trusted
void genmovreg(ref CodeBuilder cdb, uint to, uint from, tym_t tym)
{
    // Map a register to the representative of its kind. ex: GPR,XMM,SEG
    static uint kindOf(uint r)
    {
        switch (r)
        {
        case ES:                   return ES;
        case XMM15:
        case XMM0: .. case XMM7:   return XMM0;
        case AX:   .. case R15:    return AX;
        default:                   return r;
        }
    }

    // Pack (destination kind, source kind) into one value; order matters
    static uint kinds(uint dst, uint src) { return (kindOf(dst) << 8) + kindOf(src); }

    if (to == from)
        return;                             // redundant move

    if (tym == TYMAX)
        tym = TYsize_t;                     // avoid register slicing

    switch (kinds(to, from))
    {
        case kinds(AX, AX):
            genregs(cdb, 0x89, from, to);    // MOV to,from
            if (I64 && tysize(tym) >= 8)
                code_orrex(cdb.last(), REX_W);
            break;

        case kinds(ES, AX):
            assert(tysize(tym) <= REGSIZE);
            genregs(cdb, 0x8E, 0, from);
            break;

        case kinds(AX, ES):
            assert(tysize(tym) <= REGSIZE);
            genregs(cdb, 0x8C, 0, to);
            break;

        case kinds(XMM0, XMM0):              // MOVD/Q to,from
            genregs(cdb, xmmload(tym), to-XMM0, from-XMM0);
            checkSetVex(cdb.last(), tym);
            break;

        case kinds(AX, XMM0):                // MOVD/Q to,from
            genregs(cdb, STOD, from-XMM0, to);
            if (I64 && tysize(tym) >= 8)
                code_orrex(cdb.last(), REX_W);
            checkSetVex(cdb.last(), tym);
            break;

        case kinds(XMM0, AX):                // MOVD/Q to,from
            genregs(cdb, LODD, to-XMM0, from);
            if (I64 && tysize(tym) >= 8)
                code_orrex(cdb.last(), REX_W);
            checkSetVex(cdb.last(), tym);
            break;

        default:
            debug printf("genmovreg(to = %s, from = %s)\n"
                , regm_str(mask(to)), regm_str(mask(from)));
            assert(0);
    }
}
2916 
2917 /***************************************
2918  * Generate immediate multiply instruction for r1=r2*imm.
2919  * Optimize it into LEA's if we can.
2920  */
2921 
2922 @trusted
2923 void genmulimm(ref CodeBuilder cdb,uint r1,uint r2,targ_int imm)
2924 {
2925     // These optimizations should probably be put into pinholeopt()
2926     switch (imm)
2927     {
2928         case 1:
2929             genmovreg(cdb,r1,r2);
2930             break;
2931 
2932         case 5:
2933         {
2934             code cs;
2935             cs.Iop = LEA;
2936             cs.Iflags = 0;
2937             cs.Irex = 0;
2938             buildEA(&cs,r2,r2,4,0);
2939             cs.orReg(r1);
2940             cdb.gen(&cs);
2941             break;
2942         }
2943 
2944         default:
2945             cdb.genc2(0x69,modregxrmx(3,r1,r2),imm);    // IMUL r1,r2,imm
2946             break;
2947     }
2948 }
2949 
2950 /******************************
2951  * Load CX with the value of _AHSHIFT.
2952  */
2953 
2954 void genshift(ref CodeBuilder cdb)
2955 {
2956     assert(0);
2957 }
2958 
2959 /******************************
2960  * Move constant value into reg.
2961  * Take advantage of existing values in registers.
2962  * If flags & mPSW
2963  *      set flags based on result
2964  * Else if flags & 8
2965  *      do not disturb flags
2966  * Else
2967  *      don't care about flags
2968  * If flags & 1 then byte move
2969  * If flags & 2 then short move (for I32 and I64)
2970  * If flags & 4 then don't disturb unused portion of register
2971  * If flags & 16 then reg is a byte register AL..BH
2972  * If flags & 64 (0x40) then 64 bit move (I64 only)
2973  * Returns:
2974  *      code (if any) generated
2975  */
2976 
2977 @trusted
2978 void movregconst(ref CodeBuilder cdb,reg_t reg,targ_size_t value,regm_t flags)
2979 {
2980     reg_t r;
2981     regm_t mreg;
2982 
2983     //printf("movregconst(reg=%s, value= %lld (%llx), flags=%x)\n", regm_str(mask(reg)), value, value, flags);
2984 
2985     regm_t regm = regcon.immed.mval & mask(reg);
2986     targ_size_t regv = regcon.immed.value[reg];
2987 
2988     if (flags & 1)      // 8 bits
2989     {
2990         value &= 0xFF;
2991         regm &= BYTEREGS;
2992 
2993         // If we already have the right value in the right register
2994         if (regm && (regv & 0xFF) == value)
2995             goto L2;
2996 
2997         if (flags & 16 && reg & 4 &&    // if an H byte register
2998             regcon.immed.mval & mask(reg & 3) &&
2999             (((regv = regcon.immed.value[reg & 3]) >> 8) & 0xFF) == value)
3000             goto L2;
3001 
3002         /* Avoid byte register loads to avoid dependency stalls.
3003          */
3004         if ((I32 || I64) &&
3005             config.target_cpu >= TARGET_PentiumPro && !(flags & 4))
3006             goto L3;
3007 
3008         // See if another register has the right value
3009         r = 0;
3010         for (mreg = (regcon.immed.mval & BYTEREGS); mreg; mreg >>= 1)
3011         {
3012             if (mreg & 1)
3013             {
3014                 if ((regcon.immed.value[r] & 0xFF) == value)
3015                 {
3016                     genregs(cdb,0x8A,reg,r);          // MOV regL,rL
3017                     if (I64 && reg >= 4 || r >= 4)
3018                         code_orrex(cdb.last(), REX);
3019                     goto L2;
3020                 }
3021                 if (!(I64 && reg >= 4) &&
3022                     r < 4 && ((regcon.immed.value[r] >> 8) & 0xFF) == value)
3023                 {
3024                     genregs(cdb,0x8A,reg,r | 4);      // MOV regL,rH
3025                     goto L2;
3026                 }
3027             }
3028             r++;
3029         }
3030 
3031         if (value == 0 && !(flags & 8))
3032         {
3033             if (!(flags & 4) &&                 // if we can set the whole register
3034                 !(flags & 16 && reg & 4))       // and reg is not an H register
3035             {
3036                 genregs(cdb,0x31,reg,reg);      // XOR reg,reg
3037                 regimmed_set(reg,value);
3038                 regv = 0;
3039             }
3040             else
3041                 genregs(cdb,0x30,reg,reg);      // XOR regL,regL
3042             flags &= ~mPSW;                     // flags already set by XOR
3043         }
3044         else
3045         {
3046             cdb.genc2(0xC6,modregrmx(3,0,reg),value);  // MOV regL,value
3047             if (reg >= 4 && I64)
3048             {
3049                 code_orrex(cdb.last(), REX);
3050             }
3051         }
3052     L2:
3053         if (flags & mPSW)
3054             genregs(cdb,0x84,reg,reg);            // TEST regL,regL
3055 
3056         if (regm)
3057             // Set just the 'L' part of the register value
3058             regimmed_set(reg,(regv & ~cast(targ_size_t)0xFF) | value);
3059         else if (flags & 16 && reg & 4 && regcon.immed.mval & mask(reg & 3))
3060             // Set just the 'H' part of the register value
3061             regimmed_set((reg & 3),(regv & ~cast(targ_size_t)0xFF00) | (value << 8));
3062         return;
3063     }
3064 L3:
3065     if (I16)
3066         value = cast(targ_short) value;             // sign-extend MSW
3067     else if (I32)
3068         value = cast(targ_int) value;
3069 
3070     if (!I16 && flags & 2)                      // load 16 bit value
3071     {
3072         value &= 0xFFFF;
3073         if (value && !(flags & mPSW))
3074         {
3075             cdb.genc2(0xC7,modregrmx(3,0,reg),value); // MOV reg,value
3076             regimmed_set(reg, value);
3077             return;
3078         }
3079     }
3080 
3081     // If we already have the right value in the right register
3082     if (regm && (regv & 0xFFFFFFFF) == (value & 0xFFFFFFFF) && !(flags & 64))
3083     {
3084         if (flags & mPSW)
3085             gentstreg(cdb,reg);
3086     }
3087     else if (flags & 64 && regm && regv == value)
3088     {   // Look at the full 64 bits
3089         if (flags & mPSW)
3090         {
3091             gentstreg(cdb,reg);
3092             code_orrex(cdb.last(), REX_W);
3093         }
3094     }
3095     else
3096     {
3097         if (flags & mPSW)
3098         {
3099             switch (value)
3100             {
3101                 case 0:
3102                     genregs(cdb,0x31,reg,reg);
3103                     break;
3104 
3105                 case 1:
3106                     if (I64)
3107                         goto L4;
3108                     genregs(cdb,0x31,reg,reg);
3109                     goto inc;
3110 
3111                 case ~cast(targ_size_t)0:
3112                     if (I64)
3113                         goto L4;
3114                     genregs(cdb,0x31,reg,reg);
3115                     goto dec;
3116 
3117                 default:
3118                 L4:
3119                     if (flags & 64)
3120                     {
3121                         cdb.genc2(0xB8 + (reg&7),REX_W << 16 | (reg&8) << 13,value); // MOV reg,value64
3122                         gentstreg(cdb,reg);
3123                         code_orrex(cdb.last(), REX_W);
3124                     }
3125                     else
3126                     {
3127                         value &= 0xFFFFFFFF;
3128                         cdb.genc2(0xB8 + (reg&7),(reg&8) << 13,value); // MOV reg,value
3129                         gentstreg(cdb,reg);
3130                     }
3131                     break;
3132             }
3133         }
3134         else
3135         {
3136             // Look for single byte conversion
3137             if (regcon.immed.mval & mAX)
3138             {
3139                 if (I32)
3140                 {
3141                     if (reg == AX && value == cast(targ_short) regv)
3142                     {
3143                         cdb.gen1(0x98);               // CWDE
3144                         goto done;
3145                     }
3146                     if (reg == DX &&
3147                         value == (regcon.immed.value[AX] & 0x80000000 ? 0xFFFFFFFF : 0) &&
3148                         !(config.flags4 & CFG4speed && config.target_cpu >= TARGET_Pentium)
3149                        )
3150                     {
3151                         cdb.gen1(0x99);               // CDQ
3152                         goto done;
3153                     }
3154                 }
3155                 else if (I16)
3156                 {
3157                     if (reg == AX &&
3158                         cast(targ_short) value == cast(byte) regv)
3159                     {
3160                         cdb.gen1(0x98);               // CBW
3161                         goto done;
3162                     }
3163 
3164                     if (reg == DX &&
3165                         cast(targ_short) value == (regcon.immed.value[AX] & 0x8000 ? cast(targ_short) 0xFFFF : cast(targ_short) 0) &&
3166                         !(config.flags4 & CFG4speed && config.target_cpu >= TARGET_Pentium)
3167                        )
3168                     {
3169                         cdb.gen1(0x99);               // CWD
3170                         goto done;
3171                     }
3172                 }
3173             }
3174             if (value == 0 && !(flags & 8) && config.target_cpu >= TARGET_80486)
3175             {
3176                 genregs(cdb,0x31,reg,reg);              // XOR reg,reg
3177                 goto done;
3178             }
3179 
3180             if (!I64 && regm && !(flags & 8))
3181             {
3182                 if (regv + 1 == value ||
3183                     // Catch case of (0xFFFF+1 == 0) for 16 bit compiles
3184                     (I16 && cast(targ_short)(regv + 1) == cast(targ_short)value))
3185                 {
3186                 inc:
3187                     cdb.gen1(0x40 + reg);     // INC reg
3188                     goto done;
3189                 }
3190                 if (regv - 1 == value)
3191                 {
3192                 dec:
3193                     cdb.gen1(0x48 + reg);     // DEC reg
3194                     goto done;
3195                 }
3196             }
3197 
3198             // See if another register has the right value
3199             r = 0;
3200             for (mreg = regcon.immed.mval; mreg; mreg >>= 1)
3201             {
3202                 debug
3203                 assert(!I16 || regcon.immed.value[r] == cast(targ_short)regcon.immed.value[r]);
3204 
3205                 if (mreg & 1 && regcon.immed.value[r] == value)
3206                 {
3207                     genmovreg(cdb,reg,r);
3208                     goto done;
3209                 }
3210                 r++;
3211             }
3212 
3213             if (value == 0 && !(flags & 8))
3214             {
3215                 genregs(cdb,0x31,reg,reg);              // XOR reg,reg
3216             }
3217             else
3218             {   // See if we can just load a byte
3219                 if (regm & BYTEREGS &&
3220                     !(config.flags4 & CFG4speed && config.target_cpu >= TARGET_PentiumPro)
3221                    )
3222                 {
3223                     if ((regv & ~cast(targ_size_t)0xFF) == (value & ~cast(targ_size_t)0xFF))
3224                     {
3225                         movregconst(cdb,reg,value,(flags & 8) |4|1);  // load regL
3226                         return;
3227                     }
3228                     if (regm & (mAX|mBX|mCX|mDX) &&
3229                         (regv & ~cast(targ_size_t)0xFF00) == (value & ~cast(targ_size_t)0xFF00) &&
3230                         !I64)
3231                     {
3232                         movregconst(cdb,4|reg,value >> 8,(flags & 8) |4|1|16); // load regH
3233                         return;
3234                     }
3235                 }
3236                 if (flags & 64)
3237                     cdb.genc2(0xB8 + (reg&7),REX_W << 16 | (reg&8) << 13,value); // MOV reg,value64
3238                 else
3239                 {
3240                     value &= 0xFFFFFFFF;
3241                     cdb.genc2(0xB8 + (reg&7),(reg&8) << 13,value); // MOV reg,value
3242                 }
3243             }
3244         }
3245     done:
3246         regimmed_set(reg,value);
3247     }
3248 }
3249 
3250 /**************************
3251  * Generate a jump instruction.
3252  */
3253 
3254 @trusted
3255 void genjmp(ref CodeBuilder cdb,opcode_t op,uint fltarg,block *targ)
3256 {
3257     code cs;
3258     cs.Iop = op & 0xFF;
3259     cs.Iflags = 0;
3260     cs.Irex = 0;
3261     if (op != JMP && op != 0xE8)        // if not already long branch
3262           cs.Iflags = CFjmp16;          // assume long branch for op = 0x7x
3263     cs.IFL2 = cast(ubyte)fltarg;        // FLblock (or FLcode)
3264     cs.IEV2.Vblock = targ;              // target block (or code)
3265     if (fltarg == FLcode)
3266         (cast(code *)targ).Iflags |= CFtarg;
3267 
3268     if (config.flags4 & CFG4fastfloat)  // if fast floating point
3269     {
3270         cdb.gen(&cs);
3271         return;
3272     }
3273 
3274     switch (op & 0xFF00)                // look at second jump opcode
3275     {
3276         // The JP and JNP come from floating point comparisons
3277         case JP << 8:
3278             cdb.gen(&cs);
3279             cs.Iop = JP;
3280             cdb.gen(&cs);
3281             break;
3282 
3283         case JNP << 8:
3284         {
3285             // Do a JP around the jump instruction
3286             code *cnop = gennop(null);
3287             genjmp(cdb,JP,FLcode,cast(block *) cnop);
3288             cdb.gen(&cs);
3289             cdb.append(cnop);
3290             break;
3291         }
3292 
3293         case 1 << 8:                    // toggled no jump
3294         case 0 << 8:
3295             cdb.gen(&cs);
3296             break;
3297 
3298         default:
3299             debug
3300             printf("jop = x%x\n",op);
3301             assert(0);
3302     }
3303 }
3304 
3305 /*********************************************
3306  * Generate first part of prolog for interrupt function.
3307  */
3308 @trusted
3309 void prolog_ifunc(ref CodeBuilder cdb, tym_t* tyf)
3310 {
3311     static immutable ubyte[4] ops2 = [ 0x60,0x1E,0x06,0 ];
3312     static immutable ubyte[11] ops0 = [ 0x50,0x51,0x52,0x53,
3313                                     0x54,0x55,0x56,0x57,
3314                                     0x1E,0x06,0 ];
3315 
3316     immutable(ubyte)* p = (config.target_cpu >= TARGET_80286) ? ops2.ptr : ops0.ptr;
3317     do
3318         cdb.gen1(*p);
3319     while (*++p);
3320 
3321     genregs(cdb,0x8B,BP,SP);     // MOV BP,SP
3322     if (localsize)
3323         cod3_stackadj(cdb, cast(int)localsize);
3324 
3325     *tyf |= mTYloadds;
3326 }
3327 
/*********************************************
 * Generate the part of the prolog that reloads DS, and the CLD
 * for interrupt functions.
 * Params:
 *      cdb = code builder that receives the generated code
 *      tyf = function type flags; mTYloadds requests the DS reload
 *      tym = function type; TYifunc gets a CLD
 *      pushds = true if DS was already pushed
 */
@trusted
void prolog_ifunc2(ref CodeBuilder cdb, tym_t tyf, tym_t tym, bool pushds)
{
    /* Determine if we need to reload DS        */
    const bool reloadDS = (tyf & mTYloadds) != 0;
    if (reloadDS)
    {
        if (!pushds)                           // if not already pushed
            cdb.gen1(0x1E);                    // PUSH DS
        spoff += _tysize[TYint];

        // MOV  AX,DGROUP
        cdb.genc(0xC7,modregrm(3,0,AX),0,0,FLdatseg,cast(targ_uns) 0);
        code *movdgroup = cdb.last();
        movdgroup.IEV2.Vseg = DATA;
        movdgroup.Iflags ^= CFseg | CFoff;     // turn off CFoff, on CFseg

        cdb.gen2(0x8E,modregrm(3,3,AX));       // MOV  DS,AX
        useregs(mAX);
    }

    const bool isInterrupt = (tym == TYifunc);
    if (isInterrupt)
        cdb.gen1(0xFC);                        // CLD
}
3348 
/*********************************************
 * Generate the prolog of a 16 bit Windows far function, as selected
 * by config.wflags.
 * Params:
 *      cdb = code builder that receives the generated code
 *      tyf = function type flags; mTYloadds may be cleared in it
 *      pushds = set to true if a PUSH DS was generated
 */
@trusted
void prolog_16bit_windows_farfunc(ref CodeBuilder cdb, tym_t* tyf, bool* pushds)
{
    int wflags = config.wflags;
    const bool exported = (*tyf & mTYexport) != 0;
    if (wflags & WFreduced && !exported)
    {   // reduced prolog/epilog for non-exported functions
        wflags &= ~(WFdgroup | WFds | WFss);
    }

    getregsNoSave(mAX);                     // should not have any value in AX

    // First load AX with the segment value that will go into DS
    const int segsource = wflags & (WFdgroup | WFds | WFss);
    switch (segsource)
    {
        case 0:
            break;

        case WFdgroup:                      // MOV  AX,DGROUP
        {
            if (wflags & WFreduced)
                *tyf &= ~mTYloadds;          // remove redundancy
            cdb.genc(0xC7,modregrm(3,0,AX),0,0,FLdatseg,cast(targ_uns) 0);
            code *movdgroup = cdb.last();
            movdgroup.IEV2.Vseg = DATA;
            movdgroup.Iflags ^= CFseg | CFoff;     // turn off CFoff, on CFseg
            break;
        }

        case WFss:
        case WFds:
        {
            const int segreg = (segsource == WFss) ? 2      // SS
                                                   : 3;     // DS
            cdb.gen2(0x8C,modregrm(3,segreg,AX)); // MOV AX,segreg
            if (wflags & WFds)
                cdb.gen1(0x90);             // NOP
            break;
        }

        default:
            debug
            printf("config.wflags = x%x\n",config.wflags);
            assert(0);
    }

    // Then set up the frame
    if (wflags & WFincbp)
        cdb.gen1(0x40 + BP);              // INC  BP
    cdb.gen1(0x50 + BP);                  // PUSH BP
    genregs(cdb,0x8B,BP,SP);              // MOV  BP,SP

    const bool saveDS = (wflags & (WFsaveds | WFds | WFss | WFdgroup)) != 0;
    if (saveDS)
    {
        cdb.gen1(0x1E);                       // PUSH DS
        *pushds = true;
        BPoff = -REGSIZE;
    }

    const bool setDS = (wflags & (WFds | WFss | WFdgroup)) != 0;
    if (setDS)
        cdb.gen2(0x8E,modregrm(3,3,AX));      // MOV  DS,AX
}
3407 
3408 /**********************************************
3409  * Set up frame register.
3410  * Params:
3411  *      cdb        = write generated code here
3412  *      farfunc    = true if a far function
3413  *      enter      = set to true if ENTER instruction can be used, false otherwise
3414  *      xlocalsize = amount of local variables, set to amount to be subtracted from stack pointer
3415  *      cfa_offset = set to frame pointer's offset from the CFA
3416  * Returns:
3417  *      generated code
3418  */
3419 @trusted
3420 void prolog_frame(ref CodeBuilder cdb, bool farfunc, ref uint xlocalsize, out bool enter, out int cfa_offset)
3421 {
3422     //printf("prolog_frame\n");
3423     cfa_offset = 0;
3424 
3425     if (0 && config.exe == EX_WIN64)
3426     {
3427         // PUSH RBP
3428         // LEA RBP,0[RSP]
3429         cdb. gen1(0x50 + BP);
3430         cdb.genc1(LEA,(REX_W<<16) | (modregrm(0,4,SP)<<8) | modregrm(2,BP,4),FLconst,0);
3431         enter = false;
3432         return;
3433     }
3434 
3435     if (config.wflags & WFincbp && farfunc)
3436         cdb.gen1(0x40 + BP);      // INC  BP
3437     if (config.target_cpu < TARGET_80286 ||
3438         config.exe & (EX_posix | EX_WIN64) ||
3439         !localsize ||
3440         config.flags & CFGstack ||
3441         (xlocalsize >= 0x1000 && config.exe & EX_flat) ||
3442         localsize >= 0x10000 ||
3443         (NTEXCEPTIONS == 2 &&
3444          (usednteh & (NTEH_try | NTEH_except | NTEHcpp | EHcleanup | EHtry | NTEHpassthru) && (config.ehmethod == EHmethod.EH_WIN32 && !(funcsym_p.Sfunc.Fflags3 & Feh_none) || config.ehmethod == EHmethod.EH_SEH))) ||
3445         (config.target_cpu >= TARGET_80386 &&
3446          config.flags4 & CFG4speed)
3447        )
3448     {
3449         cdb.gen1(0x50 + BP);      // PUSH BP
3450         genregs(cdb,0x8B,BP,SP);      // MOV  BP,SP
3451         if (I64)
3452             code_orrex(cdb.last(), REX_W);   // MOV RBP,RSP
3453         if ((config.objfmt & (OBJ_ELF | OBJ_MACH)) && config.fulltypes)
3454             // Don't reorder instructions, as dwarf CFA relies on it
3455             code_orflag(cdb.last(), CFvolatile);
3456 static if (NTEXCEPTIONS == 2)
3457 {
3458         if (usednteh & (NTEH_try | NTEH_except | NTEHcpp | EHcleanup | EHtry | NTEHpassthru) && (config.ehmethod == EHmethod.EH_WIN32 && !(funcsym_p.Sfunc.Fflags3 & Feh_none) || config.ehmethod == EHmethod.EH_SEH))
3459         {
3460             nteh_prolog(cdb);
3461             int sz = nteh_contextsym_size();
3462             assert(sz != 0);        // should be 5*4, not 0
3463             xlocalsize -= sz;      // sz is already subtracted from ESP
3464                                     // by nteh_prolog()
3465         }
3466 }
3467         if (config.fulltypes == CVDWARF_C || config.fulltypes == CVDWARF_D ||
3468             config.ehmethod == EHmethod.EH_DWARF)
3469         {
3470             int off = 2 * REGSIZE;      // 1 for the return address + 1 for the PUSH EBP
3471             dwarf_CFA_set_loc(1);           // address after PUSH EBP
3472             dwarf_CFA_set_reg_offset(SP, off); // CFA is now 8[ESP]
3473             dwarf_CFA_offset(BP, -off);       // EBP is at 0[ESP]
3474             dwarf_CFA_set_loc(I64 ? 4 : 3);   // address after MOV EBP,ESP
3475             /* Oddly, the CFA is not the same as the frame pointer,
3476              * which is why the offset of BP is set to 8
3477              */
3478             dwarf_CFA_set_reg_offset(BP, off);        // CFA is now 0[EBP]
3479             cfa_offset = off;  // remember the difference between the CFA and the frame pointer
3480         }
3481         enter = false;              /* do not use ENTER instruction */
3482     }
3483     else
3484         enter = true;
3485 }
3486 
3487 /**********************************************
3488  * Enforce stack alignment.
3489  * Input:
3490  *      cdb     code builder.
3491  * Returns:
3492  *      generated code
3493  */
3494 @trusted
3495 void prolog_stackalign(ref CodeBuilder cdb)
3496 {
3497     if (!enforcealign)
3498         return;
3499 
3500     const offset = (hasframe ? 2 : 1) * REGSIZE;   // 1 for the return address + 1 for the PUSH EBP
3501     if (offset & (STACKALIGN - 1) || TARGET_STACKALIGN < STACKALIGN)
3502         cod3_stackalign(cdb, STACKALIGN);
3503 }
3504 
/**********************************************
 * Allocate the local stack frame.
 *
 * Depending on target and options this emits one of:
 *  - a stack probing sequence (a call to the CHKSTK runtime symbol for 16 bit
 *    code, otherwise an inline loop touching the stack 0x1000 bytes at a time),
 *  - an ENTER instruction,
 *  - a single PUSH, when exactly one register-sized slot is needed and
 *    CFG4optimized is set,
 *  - a plain adjustment through cod3_stackadj().
 * Params:
 *      cdb = append generated instructions to this
 *      tyf = function type; TYmfunc makes CX rather than AX the register that is PUSHed
 *      xlocalsize = number of bytes to allocate
 *      enter = use ENTER (only looked at when no probing is done)
 *      pushalloc = set to true when the allocation was done with a PUSH,
 *                  left untouched otherwise
 */
@trusted
void prolog_frameadj(ref CodeBuilder cdb, tym_t tyf, uint xlocalsize, bool enter, bool* pushalloc)
{
    // Decide whether the stack has to be touched while it grows
    bool probe = false;
    if (!(config.exe & (EX_LINUX | EX_LINUX64)))    // seems that Linux doesn't need to fault in stack pages
    {
        // stack overflow checking was asked for
        const bool overflowCheck = (config.flags & CFGstack) && !(I32 && xlocalsize < 0x1000);
        // Windows target with a frame of 0x1000 bytes or more
        const bool bigWindowsFrame = (config.exe & (EX_windos & EX_flat)) && xlocalsize >= 0x1000;
        probe = overflowCheck || bigWindowsFrame;
    }

    if (!probe)
    {
        if (enter)
        {
            // ENTER xlocalsize,0
            cdb.genc(ENTER,0,FLconst,xlocalsize,FLconst,cast(targ_uns) 0);
            assert(!(config.fulltypes == CVDWARF_C || config.fulltypes == CVDWARF_D)); // didn't emit Dwarf data
            return;
        }

        if (xlocalsize == REGSIZE && config.flags4 & CFG4optimized)
        {
            const uint pushallocreg = (tyf == TYmfunc) ? CX : AX;
            cdb.gen1(0x50 + pushallocreg);      // PUSH AX (or CX)
            // Marked volatile to prevent an -x[EBP] to be moved in
            // front of the push.
            code_orflag(cdb.last(),CFvolatile);
            *pushalloc = true;
            return;
        }

        cod3_stackadj(cdb, xlocalsize);
        return;
    }

    if (I16)
    {
        // BUG: Won't work if parameter is passed in AX
        movregconst(cdb,AX,xlocalsize,false);   // MOV AX,localsize
        auto chkstk = getRtlsym(RTLSYM.CHKSTK);
        makeitextern(chkstk);
        // CALL _chkstk
        cdb.gencs((LARGECODE) ? 0x9A : CALL,0,FLfunc,chkstk);
        // everything the helper does not preserve counts as used
        useregs((ALLREGS | mBP | mES) & ~chkstk.Sregsaved);
        return;
    }

    /* Inline probe loop:
     *      MOV     EDX, xlocalsize/0x1000
     *  L1: SUB     ESP, 0x1000
     *      TEST    [ESP],ESP
     *      DEC     EDX
     *      JNE     L1
     *      SUB     ESP, xlocalsize % 0x1000
     *
     * Watch out for 64 bit code where EDX is passed as a register parameter,
     * which is why R11 is the counter there.
     *
     * NOTE(review): if this path is reached with xlocalsize < 0x1000 the counter
     * starts at 0 and the DEC/JNE pair does not stop after the first pass -
     * confirm that cannot happen for the targets that enable probing.
     */
    const reg_t scratch = I64 ? R11 : DX;

    movregconst(cdb, scratch, xlocalsize / 0x1000, false);
    cod3_stackadj(cdb, 0x1000);
    code_orflag(cdb.last(), CFtarg2);           // the SUB is the target of the JNE below
    cdb.gen2sib(0x85, modregrm(0,SP,4),modregrm(0,4,SP));   // TEST [ESP],ESP
    if (I64)
    {
        cdb.gen2(0xFF, modregrmx(3,1,R11));     // DEC R11D
        cdb.genc2(JNE,0,cast(targ_uns)-15);     // hard coded displacement back to L1
    }
    else
    {
        cdb.gen1(0x48 + DX);                    // DEC EDX
        cdb.genc2(JNE,0,cast(targ_uns)-12);     // hard coded displacement back to L1
    }
    regimmed_set(scratch,0);                    // the counter register is now 0
    cod3_stackadj(cdb, xlocalsize & 0xFFF);     // the part that is not a whole 0x1000
    useregs(mask(scratch));
}
3577 
/**********************************************
 * Allocate the local stack frame, using PUSH instructions when exactly one
 * or two register-sized slots are needed and cod3_stackadj() otherwise.
 * Params:
 *      cdb = append generated instructions to this
 *      tyf = function type; TYmfunc makes CX rather than AX the register that is PUSHed
 *      xlocalsize = number of bytes to allocate
 *      pushalloc = set to true when PUSHes were used, left untouched otherwise
 */
void prolog_frameadj2(ref CodeBuilder cdb, tym_t tyf, uint xlocalsize, bool* pushalloc)
{
    // How many PUSHes cover the request exactly (0: not expressible as 1 or 2 slots)
    uint pushes = 0;
    if (xlocalsize == REGSIZE)
        pushes = 1;
    else if (xlocalsize == 2 * REGSIZE)
        pushes = 2;

    if (pushes == 0)
    {
        cod3_stackadj(cdb, xlocalsize);
        return;
    }

    const uint pushallocreg = (tyf == TYmfunc) ? CX : AX;
    foreach (n; 0 .. pushes)
        cdb.gen1(0x50 + pushallocreg);      // PUSH AX (or CX)
    *pushalloc = true;
}
3595 
/**********************************************
 * Set up the magic parameter for alloca(): store the constant
 * `localsize - BPoff` into the frame at displacement `Alloca.offset + BPoff`
 * from BP.
 * Params:
 *      cdb = append generated instructions to this
 */
@trusted
void prolog_setupalloca(ref CodeBuilder cdb)
{
    //printf("prolog_setupalloca() offset x%x size x%x alignment x%x\n",
        //cast(int)Alloca.offset, cast(int)Alloca.size, cast(int)Alloca.alignment);

    const slot  = Alloca.offset + BPoff;    // where the value lives, relative to BP
    const value = localsize - BPoff;        // what gets stored there

    // MOV slot[BP],value
    cdb.genc(0xC7, modregrm(2,0,BPRM), FLconst, slot, FLconst, value);
    if (I64)
        code_orrex(cdb.last(), REX_W);      // 64 bit operand size
}
3609 
3610 /**************************************
3611  * Save registers that the function destroys,
3612  * but that the ABI says should be preserved across
3613  * function calls.
3614  *
3615  * Emit Dwarf info for these saves.
3616  * Params:
3617  *      cdb = append generated instructions to this
3618  *      topush = mask of registers to push
3619  *      cfa_offset = offset of frame pointer from CFA
3620  */
3621 
3622 @trusted
3623 void prolog_saveregs(ref CodeBuilder cdb, regm_t topush, int cfa_offset)
3624 {
3625     if (pushoffuse)
3626     {
3627         // Save to preallocated section in the stack frame
3628         int xmmtopush = popcnt(topush & XMMREGS);   // XMM regs take 16 bytes
3629         int gptopush = popcnt(topush) - xmmtopush;  // general purpose registers to save
3630         targ_size_t xmmoffset = pushoff + BPoff;
3631         if (!hasframe || enforcealign)
3632             xmmoffset += EBPtoESP;
3633         targ_size_t gpoffset = xmmoffset + xmmtopush * 16;
3634         while (topush)
3635         {
3636             reg_t reg = findreg(topush);
3637             topush &= ~mask(reg);
3638             if (isXMMreg(reg))
3639             {
3640                 if (hasframe && !enforcealign)
3641                 {
3642                     // MOVUPD xmmoffset[EBP],xmm
3643                     cdb.genc1(STOUPD,modregxrm(2,reg-XMM0,BPRM),FLconst,xmmoffset);
3644                 }
3645                 else
3646                 {
3647                     // MOVUPD xmmoffset[ESP],xmm
3648                     cdb.genc1(STOUPD,modregxrm(2,reg-XMM0,4) + 256*modregrm(0,4,SP),FLconst,xmmoffset);
3649                 }
3650                 xmmoffset += 16;
3651             }
3652             else
3653             {
3654                 if (hasframe && !enforcealign)
3655                 {
3656                     // MOV gpoffset[EBP],reg
3657                     cdb.genc1(0x89,modregxrm(2,reg,BPRM),FLconst,gpoffset);
3658                 }
3659                 else
3660                 {
3661                     // MOV gpoffset[ESP],reg
3662                     cdb.genc1(0x89,modregxrm(2,reg,4) + 256*modregrm(0,4,SP),FLconst,gpoffset);
3663                 }
3664                 if (I64)
3665                     code_orrex(cdb.last(), REX_W);
3666                 if (config.fulltypes == CVDWARF_C || config.fulltypes == CVDWARF_D ||
3667                     config.ehmethod == EHmethod.EH_DWARF)
3668                 {   // Emit debug_frame data giving location of saved register
3669                     code *c = cdb.finish();
3670                     pinholeopt(c, null);
3671                     dwarf_CFA_set_loc(calcblksize(c));  // address after save
3672                     dwarf_CFA_offset(reg, cast(int)(gpoffset - cfa_offset));
3673                     cdb.reset();
3674                     cdb.append(c);
3675                 }
3676                 gpoffset += REGSIZE;
3677             }
3678         }
3679     }
3680     else
3681     {
3682         while (topush)                      /* while registers to push      */
3683         {
3684             reg_t reg = findreg(topush);
3685             topush &= ~mask(reg);
3686             if (isXMMreg(reg))
3687             {
3688                 // SUB RSP,16
3689                 cod3_stackadj(cdb, 16);
3690                 // MOVUPD 0[RSP],xmm
3691                 cdb.genc1(STOUPD,modregxrm(2,reg-XMM0,4) + 256*modregrm(0,4,SP),FLconst,0);
3692                 EBPtoESP += 16;
3693                 spoff += 16;
3694             }
3695             else
3696             {
3697                 genpush(cdb, reg);
3698                 EBPtoESP += REGSIZE;
3699                 spoff += REGSIZE;
3700                 if (config.fulltypes == CVDWARF_C || config.fulltypes == CVDWARF_D ||
3701                     config.ehmethod == EHmethod.EH_DWARF)
3702                 {   // Emit debug_frame data giving location of saved register
3703                     // relative to 0[EBP]
3704                     code *c = cdb.finish();
3705                     pinholeopt(c, null);
3706                     dwarf_CFA_set_loc(calcblksize(c));  // address after PUSH reg
3707                     dwarf_CFA_offset(reg, -EBPtoESP - cfa_offset);
3708                     cdb.reset();
3709                     cdb.append(c);
3710                 }
3711             }
3712         }
3713     }
3714 }
3715 
3716 /**************************************
3717  * Undo prolog_saveregs()
3718  */
3719 
3720 @trusted
3721 private void epilog_restoreregs(ref CodeBuilder cdb, regm_t topop)
3722 {
3723     debug
3724     if (topop & ~(XMMREGS | 0xFFFF))
3725         printf("fregsaved = %s, mfuncreg = %s\n",regm_str(fregsaved),regm_str(mfuncreg));
3726 
3727     assert(!(topop & ~(XMMREGS | 0xFFFF)));
3728     if (pushoffuse)
3729     {
3730         // Save to preallocated section in the stack frame
3731         int xmmtopop = popcnt(topop & XMMREGS);   // XMM regs take 16 bytes
3732         int gptopop = popcnt(topop) - xmmtopop;   // general purpose registers to save
3733         targ_size_t xmmoffset = pushoff + BPoff;
3734         if (!hasframe || enforcealign)
3735             xmmoffset += EBPtoESP;
3736         targ_size_t gpoffset = xmmoffset + xmmtopop * 16;
3737         while (topop)
3738         {
3739             reg_t reg = findreg(topop);
3740             topop &= ~mask(reg);
3741             if (isXMMreg(reg))
3742             {
3743                 if (hasframe && !enforcealign)
3744                 {
3745                     // MOVUPD xmm,xmmoffset[EBP]
3746                     cdb.genc1(LODUPD,modregxrm(2,reg-XMM0,BPRM),FLconst,xmmoffset);
3747                 }
3748                 else
3749                 {
3750                     // MOVUPD xmm,xmmoffset[ESP]
3751                     cdb.genc1(LODUPD,modregxrm(2,reg-XMM0,4) + 256*modregrm(0,4,SP),FLconst,xmmoffset);
3752                 }
3753                 xmmoffset += 16;
3754             }
3755             else
3756             {
3757                 if (hasframe && !enforcealign)
3758                 {
3759                     // MOV reg,gpoffset[EBP]
3760                     cdb.genc1(0x8B,modregxrm(2,reg,BPRM),FLconst,gpoffset);
3761                 }
3762                 else
3763                 {
3764                     // MOV reg,gpoffset[ESP]
3765                     cdb.genc1(0x8B,modregxrm(2,reg,4) + 256*modregrm(0,4,SP),FLconst,gpoffset);
3766                 }
3767                 if (I64)
3768                     code_orrex(cdb.last(), REX_W);
3769                 gpoffset += REGSIZE;
3770             }
3771         }
3772     }
3773     else
3774     {
3775         reg_t reg = I64 ? XMM7 : DI;
3776         if (!(topop & XMMREGS))
3777             reg = R15;
3778         regm_t regm = 1 << reg;
3779 
3780         while (topop)
3781         {   if (topop & regm)
3782             {
3783                 if (isXMMreg(reg))
3784                 {
3785                     // MOVUPD xmm,0[RSP]
3786                     cdb.genc1(LODUPD,modregxrm(2,reg-XMM0,4) + 256*modregrm(0,4,SP),FLconst,0);
3787                     // ADD RSP,16
3788                     cod3_stackadj(cdb, -16);
3789                 }
3790                 else
3791                 {
3792                     cdb.gen1(0x58 + (reg & 7));         // POP reg
3793                     if (reg & 8)
3794                         code_orrex(cdb.last(), REX_B);
3795                 }
3796                 topop &= ~regm;
3797             }
3798             regm >>= 1;
3799             reg--;
3800         }
3801     }
3802 }
3803 
3804 /******************************
3805  * Generate special varargs prolog for Posix 64 bit systems.
3806  * Params:
3807  *      cdb = sink for generated code
3808  *      sv = symbol for __va_argsave
3809  */
3810 @trusted
3811 void prolog_genvarargs(ref CodeBuilder cdb, Symbol* sv)
3812 {
3813     /* Generate code to move any arguments passed in registers into
3814      * the stack variable __va_argsave,
3815      * so we can reference it via pointers through va_arg().
3816      *   struct __va_argsave_t {
3817      *     size_t[6] regs;
3818      *     real[8] fpregs;
3819      *     uint offset_regs;
3820      *     uint offset_fpregs;
3821      *     void* stack_args;
3822      *     void* reg_args;
3823      *   }
3824      * The MOVAPS instructions seg fault if data is not aligned on
3825      * 16 bytes, so this gives us a nice check to ensure no mistakes.
3826         MOV     voff+0*8[RBP],EDI
3827         MOV     voff+1*8[RBP],ESI
3828         MOV     voff+2*8[RBP],RDX
3829         MOV     voff+3*8[RBP],RCX
3830         MOV     voff+4*8[RBP],R8
3831         MOV     voff+5*8[RBP],R9
3832         TEST    AL,AL
3833         LEA     RAX,voff+6*8+0x7F[RBP]
3834         JE      L2
3835 
3836         MOVAPS  -0x0F[RAX],XMM7             // only save XMM registers if actually used
3837         MOVAPS  -0x1F[RAX],XMM6
3838         MOVAPS  -0x2F[RAX],XMM5
3839         MOVAPS  -0x3F[RAX],XMM4
3840         MOVAPS  -0x4F[RAX],XMM3
3841         MOVAPS  -0x5F[RAX],XMM2
3842         MOVAPS  -0x6F[RAX],XMM1
3843         MOVAPS  -0x7F[RAX],XMM0
3844 
3845       L2:
3846         LEA     R11, Para.size+Para.offset[RBP]
3847         MOV     9+16[RAX],R11                // set __va_argsave.stack_args
3848     * RAX and R11 are destroyed.
3849     */
3850 
3851     /* Save registers into the voff area on the stack
3852      */
3853     targ_size_t voff = Auto.size + BPoff + sv.Soffset;  // EBP offset of start of sv
3854     const int vregnum = 6;
3855     const uint vsize = vregnum * 8 + 8 * 16;
3856 
3857     static immutable reg_t[vregnum] regs = [ DI,SI,DX,CX,R8,R9 ];
3858 
3859     if (!hasframe || enforcealign)
3860         voff += EBPtoESP;
3861 
3862     regm_t namedargs = prolog_namedArgs();
3863     foreach (i, r; regs)
3864     {
3865         if (!(mask(r) & namedargs))  // unnamed arguments would be the ... ones
3866         {
3867             uint ea = (REX_W << 16) | modregxrm(2,r,BPRM);
3868             if (!hasframe || enforcealign)
3869                 ea = (REX_W << 16) | (modregrm(0,4,SP) << 8) | modregxrm(2,r,4);
3870             cdb.genc1(0x89,ea,FLconst,voff + i*8);  // MOV voff+i*8[RBP],r
3871         }
3872     }
3873 
3874     code* cnop = gennop(null);
3875     genregs(cdb,0x84,AX,AX);                   // TEST AL,AL
3876 
3877     uint ea = (REX_W << 16) | modregrm(2,AX,BPRM);
3878     if (!hasframe || enforcealign)
3879         // add sib byte for [RSP] addressing
3880         ea = (REX_W << 16) | (modregrm(0,4,SP) << 8) | modregxrm(2,AX,4);
3881     int raxoff = cast(int)(voff+6*8+0x7F);
3882     cdb.genc1(LEA,ea,FLconst,raxoff);          // LEA RAX,voff+vsize-6*8-16+0x7F[RBP]
3883 
3884     genjmp(cdb,JE,FLcode, cast(block *)cnop);  // JE L2
3885 
3886     foreach (i; 0 .. 8)
3887     {
3888         // MOVAPS -15-16*i[RAX],XMM7-i
3889         cdb.genc1(0x0F29,modregrm(0,XMM7-i,0),FLconst,-15-16*i);
3890     }
3891     cdb.append(cnop);
3892 
3893     // LEA R11, Para.size+Para.offset[RBP]
3894     uint ea2 = modregxrm(2,R11,BPRM);
3895     if (!hasframe)
3896         ea2 = (modregrm(0,4,SP) << 8) | modregrm(2,DX,4);
3897     Para.offset = (Para.offset + (REGSIZE - 1)) & ~(REGSIZE - 1);
3898     cdb.genc1(LEA,(REX_W << 16) | ea2,FLconst,Para.size + Para.offset);
3899 
3900     // MOV 9+16[RAX],R11
3901     cdb.genc1(0x89,(REX_W << 16) | modregxrm(2,R11,AX),FLconst,9 + 16);   // into stack_args_save
3902 
3903     pinholeopt(cdb.peek(), null);
3904     useregs(mAX|mR11);
3905 }
3906 
3907 /********************************
3908  * Generate elems for va_start()
3909  * Params:
3910  *      sv = symbol for __va_argsave
3911  *      parmn = last named parameter
3912  */
3913 @trusted
3914 elem* prolog_genva_start(Symbol* sv, Symbol* parmn)
3915 {
3916     enum Vregnum = 6;
3917 
3918     /* the stack variable __va_argsave points to an instance of:
3919      *   struct __va_argsave_t {
3920      *     size_t[Vregnum] regs;
3921      *     real[8] fpregs;
3922      *     struct __va_list_tag {
3923      *         uint offset_regs;
3924      *         uint offset_fpregs;
3925      *         void* stack_args;
3926      *         void* reg_args;
3927      *     }
3928      *     void* stack_args_save;
3929      *   }
3930      */
3931 
3932     enum OFF // offsets into __va_argsave_t
3933     {
3934         Offset_regs   = Vregnum*8 + 8*16,
3935         Offset_fpregs = Offset_regs + 4,
3936         Stack_args    = Offset_fpregs + 4,
3937         Reg_args      = Stack_args + 8,
3938         Stack_args_save = Reg_args + 8,
3939     }
3940 
3941     /* Compute offset_regs and offset_fpregs
3942      */
3943     regm_t namedargs = prolog_namedArgs();
3944     uint offset_regs = 0;
3945     uint offset_fpregs = Vregnum * 8;
3946     for (int i = AX; i <= XMM7; i++)
3947     {
3948         regm_t m = mask(i);
3949         if (m & namedargs)
3950         {
3951             if (m & (mDI|mSI|mDX|mCX|mR8|mR9))
3952                 offset_regs += 8;
3953             else if (m & XMMREGS)
3954                 offset_fpregs += 16;
3955             namedargs &= ~m;
3956             if (!namedargs)
3957                 break;
3958         }
3959     }
3960 
3961     // set offset_regs
3962     elem* e1 = el_bin(OPeq, TYint, el_var(sv), el_long(TYint, offset_regs));
3963     e1.EV.E1.Ety = TYint;
3964     e1.EV.E1.EV.Voffset = OFF.Offset_regs;
3965 
3966     // set offset_fpregs
3967     elem* e2 = el_bin(OPeq, TYint, el_var(sv), el_long(TYint, offset_fpregs));
3968     e2.EV.E1.Ety = TYint;
3969     e2.EV.E1.EV.Voffset = OFF.Offset_fpregs;
3970 
3971     // set reg_args
3972     elem* e4 = el_bin(OPeq, TYnptr, el_var(sv), el_ptr(sv));
3973     e4.EV.E1.Ety = TYnptr;
3974     e4.EV.E1.EV.Voffset = OFF.Reg_args;
3975 
3976     // set stack_args
3977     /* which is a pointer to the first variadic argument on the stack.
3978      * Normally, we could set it by taking the address of the last named parameter
3979      * (parmn) and then skipping past it. The trouble, though, is it fails
3980      * when all the named parameters get passed in a register.
3981      *    elem* e3 = el_bin(OPeq, TYnptr, el_var(sv), el_ptr(parmn));
3982      *    e3.EV.E1.Ety = TYnptr;
3983      *    e3.EV.E1.EV.Voffset = OFF.Stack_args;
3984      *    auto sz = type_size(parmn.Stype);
3985      *    sz = (sz + (REGSIZE - 1)) & ~(REGSIZE - 1);
3986      *    e3.EV.E2.EV.Voffset += sz;
3987      * The next possibility is to do it the way prolog_genvarargs() does:
3988      *    LEA R11, Para.size+Para.offset[RBP]
3989      * The trouble there is Para.size and Para.offset is not available when
3990      * this function is called. It might be possible to compute this earlier.(1)
3991      * Another possibility is creating a special operand type that gets filled
3992      * in after the prolog_genvarargs() is called.
3993      * Or do it this simpler way - compute the needed value in prolog_genvarargs(),
3994      * and save it in a slot just after va_argsave, called `stack_args_save`.
3995      * Then, just copy from `stack_args_save` to `stack_args`.
3996      * Although, doing (1) might be optimal.
3997      */
3998     elem* e3 = el_bin(OPeq, TYnptr, el_var(sv), el_var(sv));
3999     e3.EV.E1.Ety = TYnptr;
4000     e3.EV.E1.EV.Voffset = OFF.Stack_args;
4001     e3.EV.E2.Ety = TYnptr;
4002     e3.EV.E2.EV.Voffset = OFF.Stack_args_save;
4003 
4004     elem* e = el_combine(e1, el_combine(e2, el_combine(e3, e4)));
4005     return e;
4006 }
4007 
/************************************
 * Placeholder for the Win64 varargs prolog; it generates no code.
 * The register-to-stack copies sketched below are emitted by
 * prolog_loadparams() when the target is EX_WIN64 and the function is variadic.
 * Params:
 *      cdb = generated code sink (not written to)
 */
void prolog_gen_win64_varargs(ref CodeBuilder cdb)
{
    /* The Microsoft scheme.
     * https://msdn.microsoft.com/en-US/library/dd2wa36c%28v=vs.100%29
     * Copy registers onto stack.
         mov     8[RSP],RCX
         mov     010h[RSP],RDX
         mov     018h[RSP],R8
         mov     020h[RSP],R9
     */
}
4019 
4020 /************************************
4021  * Get mask of registers that named parameters (not ... variadic arguments) were passed in.
4022  * Returns:
4023  *      the mask
4024  */
4025 @trusted regm_t prolog_namedArgs()
4026 {
4027     regm_t namedargs;
4028     foreach (s; globsym[])
4029     {
4030         if (s.Sclass == SC.fastpar || s.Sclass == SC.shadowreg)
4031             namedargs |= s.Spregm();
4032     }
4033     return namedargs;
4034 }
4035 
4036 /************************************
4037  * Params:
4038  *      cdb = generated code sink
4039  *      tf = what's the type of the function
4040  *      pushalloc = use PUSH to allocate on the stack rather than subtracting from SP
4041  */
4042 @trusted
4043 void prolog_loadparams(ref CodeBuilder cdb, tym_t tyf, bool pushalloc)
4044 {
4045     //printf("prolog_loadparams() %s\n", funcsym_p.Sident.ptr);
4046     debug
4047     for (SYMIDX si = 0; si < globsym.length; si++)
4048     {
4049         Symbol *s = globsym[si];
4050         if (debugr && (s.Sclass == SC.fastpar || s.Sclass == SC.shadowreg))
4051         {
4052             printf("symbol '%s' is fastpar in register [l %s, m %s]\n", s.Sident.ptr,
4053                 regm_str(mask(s.Spreg)),
4054                 (s.Spreg2 == NOREG ? "NOREG" : regm_str(mask(s.Spreg2))));
4055             if (s.Sfl == FLreg)
4056                 printf("\tassigned to register %s\n", regm_str(mask(s.Sreglsw)));
4057         }
4058     }
4059 
4060     uint pushallocreg = (tyf == TYmfunc) ? CX : AX;
4061 
4062     /* Copy SCfastpar and SCshadowreg (parameters passed in registers) that were not assigned
4063      * registers into their stack locations.
4064      */
4065     regm_t shadowregm = 0;
4066     for (SYMIDX si = 0; si < globsym.length; si++)
4067     {
4068         Symbol *s = globsym[si];
4069         uint sz = cast(uint)type_size(s.Stype);
4070 
4071         if (!((s.Sclass == SC.fastpar || s.Sclass == SC.shadowreg) && s.Sfl != FLreg))
4072             continue;
4073         // Argument is passed in a register
4074 
4075         type *t = s.Stype;
4076         type *t2 = null;
4077 
4078         tym_t tyb = tybasic(t.Tty);
4079 
4080         // This logic is same as FuncParamRegs_alloc function at src/dmd/backend/cod1.d
4081         //
4082         // Find suitable SROA based on the element type
4083         // (Don't put volatile parameters in registers on Windows)
4084         if (tyb == TYarray && (config.exe != EX_WIN64 || !(t.Tty & mTYvolatile)))
4085         {
4086             type *targ1;
4087             argtypes(t, targ1, t2);
4088             if (targ1)
4089                 t = targ1;
4090         }
4091 
4092         // If struct just wraps another type
4093         if (tyb == TYstruct)
4094         {
4095             // On windows 64 bits, structs occupy a general purpose register,
4096             // regardless of the struct size or the number & types of its fields.
4097             if (config.exe != EX_WIN64)
4098             {
4099                 type *targ1 = t.Ttag.Sstruct.Sarg1type;
4100                 t2 = t.Ttag.Sstruct.Sarg2type;
4101                 if (targ1)
4102                     t = targ1;
4103             }
4104         }
4105 
4106         if (Symbol_Sisdead(*s, anyiasm))
4107         {
4108             // Ignore it, as it is never referenced
4109             continue;
4110         }
4111 
4112         targ_size_t offset = Fast.size + BPoff;
4113         if (s.Sclass == SC.shadowreg)
4114             offset = Para.size;
4115         offset += s.Soffset;
4116         if (!hasframe || (enforcealign && s.Sclass != SC.shadowreg))
4117             offset += EBPtoESP;
4118 
4119         reg_t preg = s.Spreg;
4120         foreach (i; 0 .. 2)     // twice, once for each possible parameter register
4121         {
4122             static type* type_arrayBase(type* ta)
4123             {
4124                 while (tybasic(ta.Tty) == TYarray)
4125                     ta = ta.Tnext;
4126                 return ta;
4127             }
4128             shadowregm |= mask(preg);
4129             const opcode_t op = isXMMreg(preg)
4130                 ? xmmstore(type_arrayBase(t).Tty)
4131                 : 0x89;    // MOV x[EBP],preg
4132             if (!(pushalloc && preg == pushallocreg) || s.Sclass == SC.shadowreg)
4133             {
4134                 if (hasframe && (!enforcealign || s.Sclass == SC.shadowreg))
4135                 {
4136                     // MOV x[EBP],preg
4137                     cdb.genc1(op,modregxrm(2,preg,BPRM),FLconst,offset);
4138                     if (isXMMreg(preg))
4139                     {
4140                         checkSetVex(cdb.last(), t.Tty);
4141                     }
4142                     else
4143                     {
4144                         //printf("%s Fast.size = %d, BPoff = %d, Soffset = %d, sz = %d\n",
4145                         //         s.Sident, (int)Fast.size, (int)BPoff, (int)s.Soffset, (int)sz);
4146                         if (I64 && sz > 4)
4147                             code_orrex(cdb.last(), REX_W);
4148                     }
4149                 }
4150                 else
4151                 {
4152                     // MOV offset[ESP],preg
4153                     // BUG: byte size?
4154                     cdb.genc1(op,
4155                               (modregrm(0,4,SP) << 8) |
4156                                modregxrm(2,preg,4),FLconst,offset);
4157                     if (isXMMreg(preg))
4158                     {
4159                         checkSetVex(cdb.last(), t.Tty);
4160                     }
4161                     else
4162                     {
4163                         if (I64 && sz > 4)
4164                             cdb.last().Irex |= REX_W;
4165                     }
4166                 }
4167             }
4168             preg = s.Spreg2;
4169             if (preg == NOREG)
4170                 break;
4171             if (t2)
4172                 t = t2;
4173             offset += REGSIZE;
4174         }
4175     }
4176 
4177     if (config.exe == EX_WIN64 && variadic(funcsym_p.Stype))
4178     {
4179         /* The Microsoft scheme.
4180          * https://msdn.microsoft.com/en-US/library/dd2wa36c%28v=vs.100%29
4181          * Copy registers onto stack.
4182              mov     8[RSP],RCX or XMM0
4183              mov     010h[RSP],RDX or XMM1
4184              mov     018h[RSP],R8 or XMM2
4185              mov     020h[RSP],R9 or XMM3
4186          */
4187         static immutable reg_t[4] vregs = [ CX,DX,R8,R9 ];
4188         for (int i = 0; i < vregs.length; ++i)
4189         {
4190             uint preg = vregs[i];
4191             uint offset = cast(uint)(Para.size + i * REGSIZE);
4192             if (!(shadowregm & (mask(preg) | mask(XMM0 + i))))
4193             {
4194                 if (hasframe)
4195                 {
4196                     // MOV x[EBP],preg
4197                     cdb.genc1(0x89,
4198                                      modregxrm(2,preg,BPRM),FLconst, offset);
4199                     code_orrex(cdb.last(), REX_W);
4200                 }
4201                 else
4202                 {
4203                     // MOV offset[ESP],preg
4204                     cdb.genc1(0x89,
4205                                      (modregrm(0,4,SP) << 8) |
4206                                      modregxrm(2,preg,4),FLconst,offset + EBPtoESP);
4207                 }
4208                 cdb.last().Irex |= REX_W;
4209             }
4210         }
4211     }
4212 
4213     /* Copy SCfastpar and SCshadowreg (parameters passed in registers) that were assigned registers
4214      * into their assigned registers.
4215      * Note that we have a big problem if Pa is passed in R1 and assigned to R2,
4216      * and Pb is passed in R2 but assigned to R1. Detect it and assert.
4217      */
4218     regm_t assignregs = 0;
4219     for (SYMIDX si = 0; si < globsym.length; si++)
4220     {
4221         Symbol *s = globsym[si];
4222         uint sz = cast(uint)type_size(s.Stype);
4223 
4224         if (!((s.Sclass == SC.fastpar || s.Sclass == SC.shadowreg) && s.Sfl == FLreg))
4225         {
4226             // Argument is passed in a register
4227             continue;
4228         }
4229 
4230         type *t = s.Stype;
4231         type *t2 = null;
4232         if (tybasic(t.Tty) == TYstruct && config.exe != EX_WIN64)
4233         {   type *targ1 = t.Ttag.Sstruct.Sarg1type;
4234             t2 = t.Ttag.Sstruct.Sarg2type;
4235             if (targ1)
4236                 t = targ1;
4237         }
4238 
4239         reg_t preg = s.Spreg;
4240         reg_t r = s.Sreglsw;
4241         for (int i = 0; i < 2; ++i)
4242         {
4243             if (preg == NOREG)
4244                 break;
4245             assert(!(mask(preg) & assignregs));         // not already stepped on
4246             assignregs |= mask(r);
4247 
4248             // MOV reg,preg
4249             if (r == preg)
4250             {
4251             }
4252             else if (mask(preg) & XMMREGS)
4253             {
4254                 const op = xmmload(t.Tty);      // MOVSS/D xreg,preg
4255                 uint xreg = r - XMM0;
4256                 cdb.gen2(op,modregxrmx(3,xreg,preg - XMM0));
4257             }
4258             else
4259             {
4260                 //printf("test1 mov %s, %s\n", regstring[r], regstring[preg]);
4261                 genmovreg(cdb,r,preg);
4262                 if (I64 && sz == 8)
4263                     code_orrex(cdb.last(), REX_W);
4264             }
4265             preg = s.Spreg2;
4266             r = s.Sregmsw;
4267             if (t2)
4268                 t = t2;
4269         }
4270     }
4271 
4272     /* For parameters that were passed on the stack, but are enregistered,
4273      * initialize the registers with the parameter stack values.
4274      * Do not use assignaddr(), as it will replace the stack reference with
4275      * the register.
4276      */
4277     for (SYMIDX si = 0; si < globsym.length; si++)
4278     {
4279         Symbol *s = globsym[si];
4280         uint sz = cast(uint)type_size(s.Stype);
4281 
4282         if (!((s.Sclass == SC.regpar || s.Sclass == SC.parameter) &&
4283             s.Sfl == FLreg &&
4284             (refparam
4285                 // This variable has been reference by a nested function
4286                 || MARS && s.Stype.Tty & mTYvolatile
4287                 )))
4288         {
4289             continue;
4290         }
4291         // MOV reg,param[BP]
4292         //assert(refparam);
4293         if (mask(s.Sreglsw) & XMMREGS)
4294         {
4295             const op = xmmload(s.Stype.Tty);  // MOVSS/D xreg,mem
4296             uint xreg = s.Sreglsw - XMM0;
4297             cdb.genc1(op,modregxrm(2,xreg,BPRM),FLconst,Para.size + s.Soffset);
4298             if (!hasframe)
4299             {   // Convert to ESP relative address rather than EBP
4300                 code *c = cdb.last();
4301                 c.Irm = cast(ubyte)modregxrm(2,xreg,4);
4302                 c.Isib = modregrm(0,4,SP);
4303                 c.IEV1.Vpointer += EBPtoESP;
4304             }
4305             continue;
4306         }
4307 
4308         cdb.genc1(sz == 1 ? 0x8A : 0x8B,
4309             modregxrm(2,s.Sreglsw,BPRM),FLconst,Para.size + s.Soffset);
4310         code *c = cdb.last();
4311         if (!I16 && sz == SHORTSIZE)
4312             c.Iflags |= CFopsize; // operand size
4313         if (I64 && sz >= REGSIZE)
4314             c.Irex |= REX_W;
4315         if (I64 && sz == 1 && s.Sreglsw >= 4)
4316             c.Irex |= REX;
4317         if (!hasframe)
4318         {   // Convert to ESP relative address rather than EBP
4319             assert(!I16);
4320             c.Irm = cast(ubyte)modregxrm(2,s.Sreglsw,4);
4321             c.Isib = modregrm(0,4,SP);
4322             c.IEV1.Vpointer += EBPtoESP;
4323         }
4324         if (sz > REGSIZE)
4325         {
4326             cdb.genc1(0x8B,
4327                 modregxrm(2,s.Sregmsw,BPRM),FLconst,Para.size + s.Soffset + REGSIZE);
4328             code *cx = cdb.last();
4329             if (I64)
4330                 cx.Irex |= REX_W;
4331             if (!hasframe)
4332             {   // Convert to ESP relative address rather than EBP
4333                 assert(!I16);
4334                 cx.Irm = cast(ubyte)modregxrm(2,s.Sregmsw,4);
4335                 cx.Isib = modregrm(0,4,SP);
4336                 cx.IEV1.Vpointer += EBPtoESP;
4337             }
4338         }
4339     }
4340 }
4341 
/*******************************
 * Generate the function epilog for block `b` and append it to `b.Bcode`.
 *
 * Unless the function is `naked`, the generated code consists of (in this
 * order, each part only when applicable): the trace epilog call, the NT
 * exception handling epilog, POP DS, the POPs of the registers saved by the
 * prolog, the monitor epilog, the tear down of the locals / stack frame and
 * the return instruction.
 * If `b` is not flagged BFLepilog, only the return instruction is generated.
 * An interrupt function (TYifunc) gets a fixed POP... / IRET sequence instead.
 *
 * Params:
 *      b = block the epilog is appended to
 * Output:
 *      retsize         Size of function epilog. It is reset to 0 when a full
 *                      epilog is generated; on the RET-only path the size of
 *                      the generated code is added to its current value.
 */

@trusted
void epilog(block *b)
{
    code *cpopds;
    reg_t regx;                      // register that's not a return reg
    regm_t topop;
    targ_size_t xlocalsize = localsize;

    CodeBuilder cdbx; cdbx.ctor();
    tym_t tyf = funcsym_p.ty();
    tym_t tym = tybasic(tyf);
    bool farfunc = tyfarfunc(tym) != 0;

    /* Select the scratch register before the early exit below. The RET-only
     * path can still reach the POP regx / ADD SP,imm / JMP regx sequence
     * (Para.offset >= 0x10000), and must not use AX there when the block
     * returns a value.
     */
    regx = (b.BC == BCret) ? AX : CX;

    if (!(b.Bflags & BFLepilog))       // if no epilog code
        goto Lret;                      // just generate RET

    retsize = 0;

    if (tyf & mTYnaked)                 // if no prolog/epilog
        return;

    if (tym == TYifunc)
    {
        // Zero terminated opcode sequences: POP ES, POP DS, restore the
        // general registers (POPA or individual POPs), IRET
        static immutable ubyte[5] ops2 = [ 0x07,0x1F,0x61,0xCF,0 ];
        static immutable ubyte[12] ops0 = [ 0x07,0x1F,0x5F,0x5E,
                                        0x5D,0x5B,0x5B,0x5A,
                                        0x59,0x58,0xCF,0 ];

        genregs(cdbx,0x8B,SP,BP);              // MOV SP,BP
        auto p = (config.target_cpu >= TARGET_80286) ? ops2.ptr : ops0.ptr;
        do
            cdbx.gen1(*p);
        while (*++p);
        goto Lopt;
    }

    if (config.flags & CFGtrace &&
        (!(config.flags4 & CFG4allcomdat) ||
         funcsym_p.Sclass == SC.comdat ||
         funcsym_p.Sclass == SC.global ||
         (config.flags2 & CFG2comdat && SymInline(funcsym_p))
        )
       )
    {
        Symbol *s = getRtlsym(farfunc ? RTLSYM.TRACE_EPI_F : RTLSYM.TRACE_EPI_N);
        makeitextern(s);
        cdbx.gencs(I16 ? 0x9A : CALL,0,FLfunc,s);      // CALLF _trace
        if (!I16)
            code_orflag(cdbx.last(),CFoff | CFselfrel);
        useregs((ALLREGS | mBP | mES) & ~s.Sregsaved);
    }

    if (usednteh & (NTEH_try | NTEH_except | NTEHcpp | EHcleanup | EHtry | NTEHpassthru) && (config.exe == EX_WIN32 || MARS))
    {
        nteh_epilog(cdbx);
    }

    cpopds = null;
    if (tyf & mTYloadds)
    {
        cdbx.gen1(0x1F);             // POP DS
        cpopds = cdbx.last();        // remembered so it can be cancelled below
    }

    /* Pop all the general purpose registers saved on the stack
     * by the prolog code. Remember to do them in the reverse
     * order they were pushed.
     */
    topop = fregsaved & ~mfuncreg;
    epilog_restoreregs(cdbx, topop);

    if (usednteh & NTEHjmonitor)
    {
        regm_t retregs = 0;
        if (b.BC == BCretexp)
            retregs = regmask(b.Belem.Ety, tym);
        nteh_monitor_epilog(cdbx,retregs);
        xlocalsize += 8;
    }

    if (config.wflags & WFwindows && farfunc)
    {
        int wflags = config.wflags;
        if (wflags & WFreduced && !(tyf & mTYexport))
        {   // reduced prolog/epilog for non-exported functions
            wflags &= ~(WFdgroup | WFds | WFss);
            if (!(wflags & WFsaveds))
                goto L4;                // use the ordinary frame tear down
        }

        if (localsize)
        {
            cdbx.genc1(LEA,modregrm(1,SP,6),FLconst,cast(targ_uns)-2); /* LEA SP,-2[BP] */
        }
        if (wflags & (WFsaveds | WFds | WFss | WFdgroup))
        {
            if (cpopds)
                cpopds.Iop = NOP;              // don't need previous one
            cdbx.gen1(0x1F);                    // POP DS
        }
        cdbx.gen1(0x58 + BP);                   // POP BP
        if (config.wflags & WFincbp)
            cdbx.gen1(0x48 + BP);               // DEC BP
        assert(hasframe);
    }
    else
    {
        if (needframe || (xlocalsize && hasframe))
        {
        L4:
            assert(hasframe);
            if (xlocalsize || enforcealign)
            {
                if (config.flags2 & CFG2stomp)
                {   /*   MOV  ECX,0xBEAF
                     * L1:
                     *   MOV  [ESP],ECX
                     *   ADD  ESP,4
                     *   CMP  EBP,ESP
                     *   JNE  L1
                     *   POP  EBP
                     */
                    /* Value should be:
                     * 1. != 0 (code checks for null pointers)
                     * 2. be odd (to mess up alignment)
                     * 3. fall in first 64K (likely marked as inaccessible)
                     * 4. be a value that stands out in the debugger
                     */
                    assert(I32 || I64);
                    targ_size_t value = 0x0000BEAF;
                    reg_t regcx = CX;
                    mfuncreg &= ~mask(regcx);
                    uint grex = I64 ? REX_W << 16 : 0;
                    cdbx.genc2(0xC7,grex | modregrmx(3,0,regcx),value);   // MOV regcx,value
                    cdbx.gen2sib(0x89,grex | modregrm(0,regcx,4),modregrm(0,4,SP)); // MOV [ESP],regcx
                    code *c1 = cdbx.last();                               // loop head L1
                    cdbx.genc2(0x81,grex | modregrm(3,0,SP),REGSIZE);     // ADD ESP,REGSIZE
                    genregs(cdbx,0x39,SP,BP);                             // CMP EBP,ESP
                    if (I64)
                        code_orrex(cdbx.last(),REX_W);
                    genjmp(cdbx,JNE,FLcode,cast(block *)c1);                  // JNE L1
                    // explicitly mark as short jump, needed for correct retsize calculation (Bugzilla 15779)
                    cdbx.last().Iflags &= ~CFjmp16;
                    cdbx.gen1(0x58 + BP);                                 // POP BP
                }
                else if (config.exe == EX_WIN64)
                {   // See https://msdn.microsoft.com/en-us/library/tawsa7cb%28v=vs.100%29.aspx
                    // LEA RSP,0[RBP]
                    cdbx.genc1(LEA,(REX_W<<16)|modregrm(2,SP,BPRM),FLconst,0);
                    cdbx.gen1(0x58 + BP);      // POP RBP
                }
                else if (config.target_cpu >= TARGET_80286 &&
                    !(config.target_cpu >= TARGET_80386 && config.flags4 & CFG4speed)
                   )
                    cdbx.gen1(LEAVE);          // LEAVE
                else if (0 && xlocalsize == REGSIZE && Alloca.size == 0 && I32)
                {   // This doesn't work - I should figure out why
                    mfuncreg &= ~mask(regx);
                    cdbx.gen1(0x58 + regx);    // POP regx
                    cdbx.gen1(0x58 + BP);      // POP BP
                }
                else
                {
                    genregs(cdbx,0x8B,SP,BP);  // MOV SP,BP
                    if (I64)
                        code_orrex(cdbx.last(), REX_W);   // MOV RSP,RBP
                    cdbx.gen1(0x58 + BP);      // POP BP
                }
            }
            else
                cdbx.gen1(0x58 + BP);          // POP BP
            if (config.wflags & WFincbp && farfunc)
                cdbx.gen1(0x48 + BP);              // DEC BP
        }
        else if (xlocalsize == REGSIZE && (!I16 || b.BC == BCret))
        {
            // A single POP into a scratch register releases the one slot
            mfuncreg &= ~mask(regx);
            cdbx.gen1(0x58 + regx);                    // POP regx
        }
        else if (xlocalsize)
            cod3_stackadj(cdbx, cast(int)-xlocalsize);
    }
    if (b.BC == BCret || b.BC == BCretexp)
    {
Lret:
        opcode_t op = tyfarfunc(tym) ? 0xCA : 0xC2;    // RETF imm16 : RET imm16
        if (tym == TYhfunc)
        {
            cdbx.genc2(0xC2,0,4);                       // RET 4
        }
        else if (!typfunc(tym) ||                       // if caller cleans the stack
                 config.exe == EX_WIN64 ||
                 Para.offset == 0)                      // or nothing pushed on the stack anyway
        {
            op++;                                       // to a regular RET
            cdbx.gen1(op);
        }
        else
        {   // Stack is always aligned on register size boundary
            Para.offset = (Para.offset + (REGSIZE - 1)) & ~(REGSIZE - 1);
            if (Para.offset >= 0x10000)
            {
                /* Too much to clean up with the 16 bit immediate of RET:
                    POP REG
                    ADD ESP, Para.offset
                    JMP REG
                */
                cdbx.gen1(0x58+regx);
                cdbx.genc2(0x81, modregrm(3,0,SP), Para.offset);
                if (I64)
                    code_orrex(cdbx.last(), REX_W);
                cdbx.genc2(0xFF, modregrm(3,4,regx), 0);
                if (I64)
                    code_orrex(cdbx.last(), REX_W);
            }
            else
                cdbx.genc2(op,0,Para.offset);          // RET Para.offset
        }
    }

Lopt:
    // If the last instruction of the block's code (cr) is ADD SP,imm, and the
    // first instruction of the epilog (c) sets SP, we can dump the ADD.
    CodeBuilder cdb; cdb.ctor();
    cdb.append(b.Bcode);
    code *cr = cdb.last();
    code *c = cdbx.peek();
    if (cr && c && !I64)
    {
        if (cr.Iop == 0x81 && cr.Irm == modregrm(3,0,SP))     // if ADD SP,imm
        {
            if (
                c.Iop == LEAVE ||                                // LEAVE
                (c.Iop == 0x8B && c.Irm == modregrm(3,SP,BP)) || // MOV SP,BP
                (c.Iop == LEA && c.Irm == modregrm(1,SP,6))     // LEA SP,-imm[BP]
               )
                cr.Iop = NOP;
            else if (c.Iop == 0x58 + BP)                       // if POP BP
            {
                cr.Iop = 0x8B;
                cr.Irm = modregrm(3,SP,BP);                    // MOV SP,BP
            }
        }
        else
        {
static if (0)
{
        // These optimizations don't work if the called function
        // cleans off the stack.
        if (c.Iop == 0xC3 && cr.Iop == CALL)     // CALL near
        {
            cr.Iop = 0xE9;                             // JMP near
            c.Iop = NOP;
        }
        else if (c.Iop == 0xCB && cr.Iop == 0x9A)     // CALL far
        {
            cr.Iop = 0xEA;                             // JMP far
            c.Iop = NOP;
        }
}
        }
    }

    pinholeopt(c, null);
    retsize += calcblksize(c);          // compute size of function epilog
    cdb.append(cdbx);
    b.Bcode = cdb.finish();
}
4617 
/*******************************
 * Compute the offset of SP from BP.
 * Returns:
 *      the sum of `spoff` and `localsize`
 */

@trusted
targ_size_t cod3_spoff()
{
    const targ_size_t offsetFromBP = spoff + localsize;
    //printf("spoff = x%x, localsize = x%x\n", cast(int)spoff, cast(int)localsize);
    return offsetFromBP;
}
4628 
/*******************************
 * Generate the MOV(s) that transfer symbol `s` between its assigned
 * register(s) and the location getlvalue() computes for it.
 * Params:
 *      cdb   = code sink
 *      s     = symbol; Sreglsw (and Sregmsw for values wider than REGSIZE)
 *              name its register(s)
 *      toreg = true: load the register(s) from the lvalue,
 *              false: store the register(s) to the lvalue
 */
@trusted
void gen_spill_reg(ref CodeBuilder cdb, Symbol* s, bool toreg)
{
    code cs;
    const regm_t keepmsk = toreg ? RMload : RMstore;

    /* Emit `cs`, unless it turned out to be a MOV between one and the
     * same register, which would be a no-op.
     */
    void emitUnlessSelfMove()
    {
        const bool regToReg   = (cs.Irm & 0xC0) == 0xC0;              // reg,reg
        const bool sameRegNum = (((cs.Irm >> 3) ^ cs.Irm) & 7) == 0;   // registers match
        const bool sameRexBit = (((cs.Irex >> 2) ^ cs.Irex) & 1) == 0; // REX_R and REX_B match
        if (!(regToReg && sameRegNum && sameRexBit))
            cdb.gen(&cs);
    }

    elem* e = el_var(s); // temporary elem so getlvalue() can do the addressing

    const bool inXmm = (mask(s.Sreglsw) & XMMREGS) != 0;
    if (inXmm)
    {
        // Save/restore of an XMM register: MOVSS/D xreg,mem or MOVSS/D mem,xreg
        cs.Iop = toreg ? xmmload(s.Stype.Tty) : xmmstore(s.Stype.Tty);
        getlvalue(cdb,&cs,e,keepmsk);
        cs.orReg(s.Sreglsw - XMM0);
        cdb.gen(&cs);
    }
    else
    {
        const int sz = cast(int)type_size(s.Stype);
        const bool isByte = (sz == 1);

        // MOV reg,mem[ESP] : MOV mem[ESP],reg  (byte sized forms for sz == 1)
        if (toreg)
            cs.Iop = isByte ? 0x8A : 0x8B;
        else
            cs.Iop = isByte ? 0x88 : 0x89;

        getlvalue(cdb,&cs,e,keepmsk);
        cs.orReg(s.Sreglsw);
        if (I64 && isByte && s.Sreglsw >= 4)
            cs.Irex |= REX;
        emitUnlessSelfMove();

        if (sz > REGSIZE)
        {
            // Second MOV for the most significant half
            cs.setReg(s.Sregmsw);
            getlvalue_msw(&cs);
            emitUnlessSelfMove();
        }
    }

    el_free(e);
}
4677 
/****************************
 * Generate code for, and output a thunk.
 *
 * The thunk adjusts the `this` pointer by `d` and then either jumps
 * directly to `sfunc`, or jumps through the vtbl[] of the adjusted `this`.
 * On exit `sthunk.Soffset`, `sthunk.Ssize` and `sthunk.Sseg` describe the
 * emitted code.
 *
 * Params:
 *      sthunk =  Symbol of thunk
 *      sfunc =   Symbol of thunk's target function
 *      p =       ESP parameter offset to this pointer
 *      thisty =  Type of this pointer
 *      d =       offset to add to 'this' pointer
 *      i =       offset into vtbl[]; a value whose low 16 bits are all set
 *                selects the direct jump to `sfunc` instead
 *      d2 =      offset from 'this' to vptr
 */

@trusted
void cod3_thunk(Symbol *sthunk,Symbol *sfunc,uint p,tym_t thisty,
        uint d,int i,uint d2)
{
    targ_size_t thunkoffset;

    int seg = sthunk.Sseg;
    cod3_align(seg);

    // Skip over return address
    tym_t thunkty = tybasic(sthunk.ty());
    if (tyfarfunc(thunkty))
        p += I32 ? 8 : tysize(TYfptr);          // far function
    else
        p += tysize(TYnptr);
    if (tybasic(sfunc.ty()) == TYhfunc)
        p += tysize(TYnptr);                    // skip over hidden pointer

    CodeBuilder cdb; cdb.ctor();
    if (!I16)
    {
        /*
           Generate:
            ADD p[ESP],d
           For direct call:
            JMP sfunc
           For virtual call:
            MOV EAX, p[ESP]                     EAX = this
            MOV EAX, d2[EAX]                    EAX = this.vptr
            JMP i[EAX]                          jump to virtual function
         */
        if (config.flags3 & CFG3ibt)
            cdb.gen1(I32 ? ENDBR32 : ENDBR64);

        reg_t reg = 0;                          // /digit of opcode 0x81: 0 is ADD
        if (cast(int)d < 0)
        {
            d = -d;
            reg = 5;                            // switch from ADD to SUB
        }

        /* The ADD/SUB instruction, if one gets generated. It is tracked
         * separately from cdb.last() because cdb may also contain the
         * ENDBR instruction, which must not pick up the REX.W below.
         */
        code* cadd = null;

        if (thunkty == TYmfunc)
        {                                       // ADD ECX,d
            if (d)
            {
                cdb.genc2(0x81,modregrm(3,reg,CX),d);
                cadd = cdb.last();
            }
        }
        else if (thunkty == TYjfunc || (I64 && thunkty == TYnfunc))
        {                                       // ADD EAX,d
            int rm = AX;
            if (config.exe == EX_WIN64)
                rm = CX;
            else if (I64)
                rm = (thunkty == TYnfunc && (sfunc.Sfunc.Fflags3 & F3hiddenPtr)) ? SI : DI;
            if (d)
            {
                cdb.genc2(0x81,modregrm(3,reg,rm),d);
                cadd = cdb.last();
            }
        }
        else
        {
            cdb.genc(0x81,modregrm(2,reg,4),
                FLconst,p,                      // to this
                FLconst,d);                     // ADD p[ESP],d
            cdb.last().Isib = modregrm(0,4,SP);
            cadd = cdb.last();
        }
        if (I64 && cadd)
            cadd.Irex |= REX_W;                 // 64 bit pointer arithmetic
    }
    else
    {
        /*
           Generate:
            MOV BX,SP
            ADD [SS:] p[BX],d
           For direct call:
            JMP sfunc
           For virtual call:
            MOV BX, p[BX]                       BX = this
            MOV BX, d2[BX]                      BX = this.vptr
            JMP i[BX]                           jump to virtual function
         */

        genregs(cdb,0x89,SP,BX);           // MOV BX,SP
        cdb.genc(0x81,modregrm(2,0,7),
            FLconst,p,                                  // to this
            FLconst,d);                                 // ADD p[BX],d
        if (config.wflags & WFssneds ||
            // If DS needs reloading from SS,
            // then assume SS != DS on thunk entry
            (LARGEDATA && config.wflags & WFss))
            cdb.last().Iflags |= CFss;                 // SS:
    }

    if ((i & 0xFFFF) != 0xFFFF)                 // if virtual call
    {
        const bool FARTHIS = (tysize(thisty) > REGSIZE);
        const bool FARVPTR = FARTHIS;

        assert(thisty != TYvptr);               // can't handle this case

        if (!I16)
        {
            assert(!FARTHIS && !LARGECODE);
            if (thunkty == TYmfunc)     // if 'this' is in ECX
            {
                // MOV EAX,d2[ECX]
                cdb.genc1(0x8B,modregrm(2,AX,CX),FLconst,d2);
            }
            else if (thunkty == TYjfunc)        // if 'this' is in EAX
            {
                // MOV EAX,d2[EAX]
                cdb.genc1(0x8B,modregrm(2,AX,AX),FLconst,d2);
            }
            else
            {
                // MOV EAX,p[ESP]
                cdb.genc1(0x8B,(modregrm(0,4,SP) << 8) | modregrm(2,AX,4),FLconst,cast(targ_uns) p);
                if (I64)
                    cdb.last().Irex |= REX_W;

                // MOV EAX,d2[EAX]
                cdb.genc1(0x8B,modregrm(2,AX,AX),FLconst,d2);
            }
            if (I64)
                code_orrex(cdb.last(), REX_W);  // the vptr load is 64 bits wide
                                                        // JMP i[EAX]
            cdb.genc1(0xFF,modregrm(2,4,0),FLconst,cast(targ_uns) i);
        }
        else
        {
            // MOV/LES BX,[SS:] p[BX]
            cdb.genc1((FARTHIS ? 0xC4 : 0x8B),modregrm(2,BX,7),FLconst,cast(targ_uns) p);
            if (config.wflags & WFssneds ||
                // If DS needs reloading from SS,
                // then assume SS != DS on thunk entry
                (LARGEDATA && config.wflags & WFss))
                cdb.last().Iflags |= CFss;             // SS:

            // MOV/LES BX,[ES:]d2[BX]
            cdb.genc1((FARVPTR ? 0xC4 : 0x8B),modregrm(2,BX,7),FLconst,d2);
            if (FARTHIS)
                cdb.last().Iflags |= CFes;             // ES:

                                                        // JMP i[BX]
            cdb.genc1(0xFF,modregrm(2,(LARGECODE ? 5 : 4),7),FLconst,cast(targ_uns) i);
            if (FARVPTR)
                cdb.last().Iflags |= CFes;             // ES:
        }
    }
    else
    {
        if (config.flags3 & CFG3pic)
        {
            localgot = null;                // no local variables
            CodeBuilder cdbgot; cdbgot.ctor();
            load_localgot(cdbgot);          // load GOT in EBX
            code *c1 = cdbgot.finish();
            if (c1)
            {
                assignaddrc(c1);
                cdb.append(c1);
            }
        }
        cdb.gencs((LARGECODE ? 0xEA : 0xE9),0,FLfunc,sfunc); // JMP sfunc
        cdb.last().Iflags |= LARGECODE ? (CFseg | CFoff) : (CFselfrel | CFoff);
    }

    // Write the thunk to the object file; its size is the distance the
    // segment offset advanced while doing so
    thunkoffset = Offset(seg);
    code *c = cdb.finish();
    pinholeopt(c,null);
    codout(seg,c,null);
    code_free(c);

    sthunk.Soffset = thunkoffset;
    sthunk.Ssize = Offset(seg) - thunkoffset; // size of thunk
    sthunk.Sseg = seg;
    if (config.exe & EX_posix ||
       config.objfmt == OBJ_MSCOFF)
    {
        objmod.pubdef(seg,sthunk,sthunk.Soffset);
    }
}
4869 
/*****************************
 * Treat symbol `s` as an external symbol.
 *
 * Does nothing when `s.Sxtrnnum` is already nonzero. Otherwise the storage
 * class of `s` becomes SC.extern_ and `s` is passed to objmod.external().
 * Params:
 *      s = symbol to make extern
 */

@trusted
void makeitextern(Symbol *s)
{
    if (s.Sxtrnnum != 0)
        return;                          // nothing to do

    /*printf("makeitextern(x%x)\n",s);*/
    s.Sclass = SC.extern_;               // external
    objmod.external(s);
}
4884 
4885 
4886 /*******************************
4887  * Replace JMPs in Bgotocode with JMP SHORTs wherever possible.
4888  * This routine depends on FLcode jumps to only be forward
4889  * referenced.
4890  * BFLjmpoptdone is set to true if nothing more can be done
4891  * with this block.
      *
      * Besides shortening, a branch whose displacement is 0 is turned into a
      * NOP, and (when optimizing) a conditional jump around a CALL to a
      * function flagged SFLexit is folded into a conditional jump to that
      * function.
4892  * Input:
      *      bl      block whose code list bl.Bcode is scanned and patched
4893  *      flag    !=0 means don't have correct Boffsets yet
4894  * Returns:
4895  *      number of bytes saved
4896  */
4897 
4898 @trusted
4899 int branch(block *bl,int flag)
4900 {
4901     int bytesaved;                  // running total, returned to the caller
4902     code* c,cn,ct;                  // current instruction, its successor, branch target
4903     targ_size_t offset,disp;        // offset of c; displacement of the branch
4904     targ_size_t csize;              // size in bytes of c
4905 
4906     if (!flag)
4907         bl.Bflags |= BFLjmpoptdone;      // assume this will be all
4908     c = bl.Bcode;
4909     if (!c)
4910         return 0;
4911     bytesaved = 0;
4912     offset = bl.Boffset;                 /* offset of start of block     */
4913     while (1)
4914     {
4915         ubyte op;
4916 
4917         csize = calccodsize(c);
4918         cn = code_next(c);
4919         op = cast(ubyte)c.Iop;
              // Candidates: a conditional jump (0x70..0x7F) still flagged CFjmp16,
              // or a JMP that is not flagged CFjmp5
4920         if ((op & ~0x0F) == 0x70 && c.Iflags & CFjmp16 ||
4921             (op == JMP && !(c.Iflags & CFjmp5)))
4922         {
4923           L1:
                  // Compute disp according to the kind of target
4924             switch (c.IFL2)
4925             {
4926                 case FLblock:
4927                     if (flag)           // no offsets yet, don't optimize
4928                         goto L3;
                          // distance from the end of this instruction to the target block
4929                     disp = c.IEV2.Vblock.Boffset - offset - csize;
4930 
4931                     /* If this is a forward branch, and there is an aligned
4932                      * block intervening, it is possible that shrinking
4933                      * the jump instruction will cause it to be out of
4934                      * range of the target. This happens if the alignment
4935                      * prevents the target block from moving correspondingly
4936                      * closer.
4937                      */
4938                     if (disp >= 0x7F-4 && c.IEV2.Vblock.Boffset > offset)
4939                     {   /* Look for intervening alignment
4940                          */
4941                         for (block *b = bl.Bnext; b; b = b.Bnext)
4942                         {
4943                             if (b.Balign)
4944                             {
4945                                 bl.Bflags &= ~BFLjmpoptdone;   // some JMPs left
4946                                 goto L3;
4947                             }
4948                             if (b == c.IEV2.Vblock)
4949                                 break;
4950                         }
4951                     }
4952 
4953                     break;
4954 
4955                 case FLcode:
4956                 {
4957                     code *cr;
4958 
4959                     disp = 0;
4960 
4961                     ct = c.IEV2.Vcode;         /* target of branch     */
4962                     assert(ct.Iflags & (CFtarg | CFtarg2));
                          // Forward search: add up the sizes of the instructions
                          // between this branch and its target
4963                     for (cr = cn; cr; cr = code_next(cr))
4964                     {
4965                         if (cr == ct)
4966                             break;
4967                         disp += calccodsize(cr);
4968                     }
4969 
4970                     if (!cr)
4971                     {   // Didn't find it in forward search. Try backwards jump
                              // disp becomes the total size from the target up to
                              // and including this branch (a positive magnitude)
4972                         int s = 0;
4973                         disp = 0;
4974                         for (cr = bl.Bcode; cr != cn; cr = code_next(cr))
4975                         {
4976                             assert(cr != null); // must have found it
4977                             if (cr == ct)
4978                                 s = 1;
4979                             if (s)
4980                                 disp += calccodsize(cr);
4981                         }
4982                     }
4983 
4984                     if (config.flags4 & CFG4optimized && !flag)
4985                     {
4986                         /* Propagate branch forward past junk   */
4987                         while (1)
4988                         {
4989                             if (ct.Iop == NOP ||
4990                                 ct.Iop == (ESCAPE | ESClinnum))
4991                             {
4992                                 ct = code_next(ct);
4993                                 if (!ct)
4994                                     goto L2;
4995                             }
4996                             else
4997                             {
4998                                 c.IEV2.Vcode = ct;
4999                                 ct.Iflags |= CFtarg;
5000                                 break;
5001                             }
5002                         }
5003 
5004                         /* And eliminate jmps to jmps   */
5005                         if ((op == ct.Iop || ct.Iop == JMP) &&
5006                             (op == JMP || c.Iflags & CFjmp16))
5007                         {
                                  // adopt the target of the jump we would land on,
                                  // then redo the analysis for the new target
5008                             c.IFL2 = ct.IFL2;
5009                             c.IEV2.Vcode = ct.IEV2.Vcode;
5010                             /*printf("eliminating branch\n");*/
5011                             goto L1;
5012                         }
5013                      L2:
5014                         { }
5015                     }
5016                 }
5017                     break;
5018 
5019                 default:
5020                     goto L3;
5021             }
5022 
5023             if (disp == 0)                      // bra to next instruction
5024             {
5025                 bytesaved += csize;
5026                 c.Iop = NOP;                   // del branch instruction
5027                 c.IEV2.Vcode = null;
5028                 c = cn;
5029                 if (!c)
5030                     break;
5031                 continue;
5032             }
                  // Both disp and disp - 2 must survive a round trip through a
                  // signed byte for the short form to be used
5033             else if (cast(targ_size_t)cast(targ_schar)(disp - 2) == (disp - 2) &&
5034                      cast(targ_size_t)cast(targ_schar)disp == disp)
5035             {
5036                 if (op == JMP)
5037                 {
5038                     c.Iop = JMPS;              // JMP SHORT
5039                     bytesaved += I16 ? 1 : 3;
5040                 }
5041                 else                            // else Jcond
5042                 {
5043                     c.Iflags &= ~CFjmp16;      // a branch is ok
5044                     bytesaved += I16 ? 3 : 4;
5045 
5046                     // Replace a cond jump around a call to a function that
5047                     // never returns with a cond jump to that function.
5048                     if (config.flags4 & CFG4optimized &&
5049                         config.target_cpu >= TARGET_80386 &&
5050                         disp == (I16 ? 3 : 5) &&
5051                         cn &&
5052                         cn.Iop == CALL &&
5053                         cn.IFL2 == FLfunc &&
5054                         cn.IEV2.Vsym.Sflags & SFLexit &&
5055                         !(cn.Iflags & (CFtarg | CFtarg2))
5056                        )
5057                     {
                              // 0x0F8x opcode whose condition nibble is that of c
                              // with its lowest bit flipped
5058                         cn.Iop = 0x0F00 | ((c.Iop & 0x0F) ^ 0x81);
5059                         c.Iop = NOP;
5060                         c.IEV2.Vcode = null;
5061                         bytesaved++;
5062 
5063                         // If nobody else points to ct, we can remove the CFtarg
5064                         if (flag && ct)
5065                         {
5066                             code *cx;
5067                             for (cx = bl.Bcode; 1; cx = code_next(cx))
5068                             {
5069                                 if (!cx)
5070                                 {
                                          // reached the end without finding a reference
5071                                     ct.Iflags &= ~CFtarg;
5072                                     break;
5073                                 }
5074                                 if (cx.IEV2.Vcode == ct)
5075                                     break;
5076                             }
5077                         }
5078                     }
5079                 }
5080                 csize = calccodsize(c);         // c changed, so its size did too
5081             }
5082             else
5083                 bl.Bflags &= ~BFLjmpoptdone;   // some JMPs left
5084         }
5085 L3:
          // Advance to the next instruction, keeping offset in step
5086         if (cn)
5087         {
5088             offset += csize;
5089             c = cn;
5090         }
5091         else
5092             break;
5093     }
5094     //printf("bytesaved = x%x\n",bytesaved);
5095     return bytesaved;
5096 }
5097 
5098 
/************************************************
 * Adjust all Soffset's of stack variables so they
 * are all relative to the frame pointer.
 *
 * Every symbol in `globsym` is visited; the amount added to its Soffset
 * depends on its storage class. Symbols of any other class are left alone.
 */

@trusted
void cod3_adjSymOffsets()
{
    //printf("cod3_adjSymOffsets()\n");
    foreach (si; 0 .. globsym.length)
    {
        //printf("\tglobsym[%d] = %p\n",si,globsym[si]);
        Symbol *sym = globsym[si];

        switch (sym.Sclass)
        {
            // parameters: offset by the parameter section
            case SC.parameter, SC.regpar, SC.shadowreg:
                sym.Soffset += Para.size;
                break;

            // fast parameters: offset by the Fast section
            case SC.fastpar:
                sym.Soffset += Fast.size + BPoff;
                break;

            // locals: offset by the Fast or the Auto section, depending on Sfl
            case SC.auto_, SC.register:
                sym.Soffset += ((sym.Sfl == FLfast) ? Fast.size : Auto.size) + BPoff;
                break;

            // SC.bprel and every other storage class: offset is left untouched
            case SC.bprel:
            default:
                break;
        }
    }
}
5159 
/*******************************
 * Run assignaddrc() over the code of one block, which replaces the
 * symbol info in union ev with a real address in Vpointer.
 *
 * For a block flagged BFLoutsideprolog, EBPtoESP is temporarily set
 * to -REGSIZE and hasframe to 0 while its code is processed.
 * The previous values of both globals are restored before returning.
 * Params:
 *      bl = block whose Bcode is to be processed
 */

@trusted
void assignaddr(block *bl)
{
    const int savedEBPtoESP = EBPtoESP;
    const int savedHasframe = hasframe;

    const bool outsideProlog = (bl.Bflags & BFLoutsideprolog) != 0;
    if (outsideProlog)
    {
        hasframe = 0;
        EBPtoESP = -REGSIZE;
    }

    assignaddrc(bl.Bcode);

    // put the globals back the way we found them
    EBPtoESP = savedEBPtoESP;
    hasframe = savedHasframe;
}
5180 
/*******************************
 * Walk the code list starting at c and resolve the symbolic operand
 * references of each instruction into concrete displacements.
 *
 * For each instruction:
 *  - ESCAPE pseudo-ops ESCadjesp, ESCfixesp and ESCframeptr are expanded
 *    in place (ESCadjesp also updates the global EBPtoESP).
 *  - The first operand (IFL1/IEV1) is rewritten if the instruction has a
 *    modregrm byte with a displacement; frame-relative references are
 *    converted from EBP-relative to ESP-relative when there is no frame
 *    (or when enforcealign is set and the operand is not FLpara).
 *  - The second operand (IFL2/IEV2) is rewritten if the instruction has one.
 * Operands that were fully resolved to a number have their IFLx set
 * to FLconst.
 * Params:
 *      c = start of code list to process (may be null)
 */

@trusted
void assignaddrc(code *c)
{
    int sn;
    Symbol *s;
    ubyte ins,rm;
    targ_size_t soff;
    targ_size_t base;

    // Value of EBPtoESP on entry; FLstack operands are adjusted by how far
    // EBPtoESP has moved from this (via ESCadjesp) since then.
    base = EBPtoESP;
    for (; c; c = code_next(c))
    {
        debug
        {
        if (0)
        {       printf("assignaddrc()\n");
                code_print(c);
        }
        // catch a two-node cycle in the code list
        if (code_next(c) && code_next(code_next(c)) == c)
            assert(0);
        }

        // Look up the instruction attribute flags (M, T, ...) for this opcode
        if (c.Iflags & CFvex && c.Ivex.pfx == 0xC4)
            ins = vex_inssize(c);
        else if ((c.Iop & 0xFFFD00) == 0x0F3800)
            ins = inssize2[(c.Iop >> 8) & 0xFF];
        else if ((c.Iop & 0xFF00) == 0x0F00)
            ins = inssize2[c.Iop & 0xFF];
        else if ((c.Iop & 0xFF) == ESCAPE)
        {
            if (c.Iop == (ESCAPE | ESCadjesp))
            {
                //printf("adjusting EBPtoESP (%d) by %ld\n",EBPtoESP,cast(long)c.IEV1.Vint);
                EBPtoESP += c.IEV1.Vint;
                c.Iop = NOP;
            }
            else if (c.Iop == (ESCAPE | ESCfixesp))
            {
                //printf("fix ESP\n");
                if (hasframe)
                {
                    // LEA ESP,-EBPtoESP[EBP]
                    c.Iop = LEA;
                    if (c.Irm & 8)
                        c.Irex |= REX_R;
                    c.Irm = modregrm(2,SP,BP);
                    c.Iflags = CFoff;
                    c.IFL1 = FLconst;
                    c.IEV1.Vuns = -EBPtoESP;
                    if (enforcealign)
                    {
                        // AND ESP, -STACKALIGN
                        code *cn = code_calloc();
                        cn.Iop = 0x81;
                        cn.Irm = modregrm(3, 4, SP);
                        cn.Iflags = CFoff;
                        cn.IFL2 = FLconst;
                        cn.IEV2.Vsize_t = -STACKALIGN;
                        /* The REX_W belongs on the new AND instruction; c
                         * itself gets REX_W below, after the escape chain.
                         * Without it the AND is a 32 bit operation, which in
                         * 64 bit mode clears the upper half of RSP.
                         */
                        if (I64)
                            cn.Irex |= REX_W;
                        // splice cn in right after c
                        cn.next = c.next;
                        c.next = cn;
                    }
                }
            }
            else if (c.Iop == (ESCAPE | ESCframeptr))
            {   // Convert to load of frame pointer
                // c.Irm is the register to use
                if (hasframe && !enforcealign)
                {   // MOV reg,EBP
                    c.Iop = 0x89;
                    if (c.Irm & 8)
                        c.Irex |= REX_B;
                    c.Irm = modregrm(3,BP,c.Irm & 7);
                }
                else
                {   // LEA reg,EBPtoESP[ESP]
                    c.Iop = LEA;
                    if (c.Irm & 8)
                        c.Irex |= REX_R;
                    c.Irm = modregrm(2,c.Irm & 7,4);
                    c.Isib = modregrm(0,4,SP);
                    c.Iflags = CFoff;
                    c.IFL1 = FLconst;
                    c.IEV1.Vuns = EBPtoESP;
                }
            }
            // every ESCAPE instruction gets REX_W in 64 bit mode, and has
            // no operands to resolve
            if (I64)
                c.Irex |= REX_W;
            continue;
        }
        else
            ins = inssize[c.Iop & 0xFF];
        if (!(ins & M) ||
            ((rm = c.Irm) & 0xC0) == 0xC0)
            goto do2;           /* if no first operand          */
        if (is32bitaddr(I32,c.Iflags))
        {

            // mod==0 carries a displacement only for rm==5, or for a SIB
            // byte whose base field is 5
            if (
                ((rm & 0xC0) == 0 && !((rm & 7) == 4 && (c.Isib & 7) == 5 || (rm & 7) == 5))
               )
                goto do2;       /* if no first operand  */
        }
        else
        {
            // 16 bit addressing: mod==0 carries a displacement only for rm==6
            if (
                ((rm & 0xC0) == 0 && !((rm & 7) == 6))
               )
                goto do2;       /* if no first operand  */
        }
        s = c.IEV1.Vsym;
        switch (c.IFL1)
        {
            case FLdata:
                if (config.objfmt == OBJ_OMF && s.Sclass != SC.comdat && s.Sclass != SC.extern_)
                {
                    c.IEV1.Vseg = s.Sseg;
                    c.IEV1.Vpointer += s.Soffset;
                    c.IFL1 = FLdatseg;
                }
                else
                    c.IFL1 = FLextern;
                goto do2;

            case FLudata:
                if (config.objfmt == OBJ_OMF)
                {
                    c.IEV1.Vseg = s.Sseg;
                    c.IEV1.Vpointer += s.Soffset;
                    c.IFL1 = FLdatseg;
                }
                else
                    c.IFL1 = FLextern;
                goto do2;

            case FLtlsdata:
                if (config.objfmt == OBJ_ELF || config.objfmt == OBJ_MACH)
                    c.IFL1 = FLextern;
                goto do2;

            case FLdatseg:
                //c.IEV1.Vseg = DATA;
                goto do2;

            case FLfardata:
            case FLcsdata:
            case FLpseudo:
                goto do2;

            case FLstack:
                //printf("Soffset = %d, EBPtoESP = %d, base = %d, pointer = %d\n",
                //s.Soffset,EBPtoESP,base,c.IEV1.Vpointer);
                c.IEV1.Vpointer += s.Soffset + EBPtoESP - base - EEStack.offset;
                break;

            case FLfast:
                soff = Fast.size;
                goto L1;

            case FLreg:
            case FLauto:
                soff = Auto.size;
            L1:
                // soff is the size term of the section the symbol lives in
                if (Symbol_Sisdead(*s, anyiasm))
                {
                    c.Iop = NOP;               // remove references to it
                    continue;
                }
                if (s.Sfl == FLreg && c.IEV1.Vpointer < 2)
                {
                    // The symbol lives in a register: turn the memory
                    // operand into a register operand
                    reg_t reg = s.Sreglsw;

                    assert(!(s.Sregm & ~mask(reg)));
                    if (c.IEV1.Vpointer == 1)
                    {
                        assert(reg < 4);    /* must be a BYTEREGS   */
                        reg |= 4;           /* convert to high byte reg */
                    }
                    if (reg & 8)
                    {
                        assert(I64);
                        c.Irex |= REX_B;
                        reg &= 7;
                    }
                    c.Irm = (c.Irm & modregrm(0,7,0))
                            | modregrm(3,0,reg);
                    assert(c.Iop != LES && c.Iop != LEA);
                    goto do2;
                }
                else
                {   c.IEV1.Vpointer += s.Soffset + soff + BPoff;
                    if (s.Sflags & SFLunambig)
                        c.Iflags |= CFunambig;
            L2:
                    // Common tail for all frame-relative first operands
                    if (!hasframe || (enforcealign && c.IFL1 != FLpara))
                    {   /* Convert to ESP relative address instead of EBP */
                        assert(!I16);
                        c.IEV1.Vpointer += EBPtoESP;
                        ubyte crm = c.Irm;
                        if ((crm & 7) == 4)              // if SIB byte
                        {
                            assert((c.Isib & 7) == BP);
                            assert((crm & 0xC0) != 0);
                            c.Isib = (c.Isib & ~7) | modregrm(0,0,SP);
                        }
                        else
                        {
                            // no SIB byte yet: switch to mod=2 with a SIB
                            // byte using SP as the base
                            assert((crm & 7) == 5);
                            c.Irm = (crm & modregrm(0,7,0))
                                    | modregrm(2,0,4);
                            c.Isib = modregrm(0,4,SP);
                        }
                    }
                }
                break;

            case FLpara:
                //printf("s = %s, Soffset = %d, Para.size = %d, BPoff = %d, EBPtoESP = %d, Vpointer = %d\n",
                //s.Sident.ptr, cast(int)s.Soffset, cast(int)Para.size, cast(int)BPoff,
                //cast(int)EBPtoESP, cast(int)c.IEV1.Vpointer);
                soff = Para.size - BPoff;    // cancel out add of BPoff
                goto L1;

            case FLfltreg:
                c.IEV1.Vpointer += Foff + BPoff;
                c.Iflags |= CFunambig;
                goto L2;

            case FLallocatmp:
                c.IEV1.Vpointer += Alloca.offset + BPoff;
                goto L2;

            case FLfuncarg:
                c.IEV1.Vpointer += cgstate.funcarg.offset + BPoff;
                goto L2;

            case FLbprel:
                c.IEV1.Vpointer += s.Soffset;
                break;

            case FLcs:
                sn = c.IEV1.Vuns;
                if (!CSE.loaded(sn))            // if never loaded
                {
                    c.Iop = NOP;
                    continue;
                }
                c.IEV1.Vpointer = CSE.offset(sn) + CSoff + BPoff;
                c.Iflags |= CFunambig;
                goto L2;

            case FLregsave:
                sn = c.IEV1.Vuns;
                c.IEV1.Vpointer = sn + regsave.off + BPoff;
                c.Iflags |= CFunambig;
                goto L2;

            case FLndp:
                assert(c.IEV1.Vuns < global87.save.length);
                c.IEV1.Vpointer = c.IEV1.Vuns * tysize(TYldouble) + NDPoff + BPoff;
                c.Iflags |= CFunambig;
                goto L2;

            case FLoffset:
                break;

            case FLlocalsize:
                c.IEV1.Vpointer += localsize;
                break;

            case FLconst:
            default:
                goto do2;
        }
        // first operand is now a plain number
        c.IFL1 = FLconst;
    do2:
        /* Ignore TEST (F6 and F7) opcodes      */
        if (!(ins & T)) goto done;              /* if no second operand */
        s = c.IEV2.Vsym;
        switch (c.IFL2)
        {
            case FLdata:
                if (config.objfmt == OBJ_ELF || config.objfmt == OBJ_MACH)
                {
                    c.IFL2 = FLextern;
                    goto do2;
                }
                else
                {
                    if (s.Sclass == SC.comdat)
                    {   c.IFL2 = FLextern;
                        goto do2;
                    }
                    c.IEV2.Vseg = MARS ? s.Sseg : DATA;
                    c.IEV2.Vpointer += s.Soffset;
                    c.IFL2 = FLdatseg;
                    goto done;
                }

            case FLudata:
                if (config.objfmt == OBJ_ELF || config.objfmt == OBJ_MACH)
                {
                    c.IFL2 = FLextern;
                    goto do2;
                }
                else
                {
                    c.IEV2.Vseg = MARS ? s.Sseg : UDATA;
                    c.IEV2.Vpointer += s.Soffset;
                    c.IFL2 = FLdatseg;
                    goto done;
                }

            case FLtlsdata:
                if (config.objfmt == OBJ_ELF || config.objfmt == OBJ_MACH)
                {
                    c.IFL2 = FLextern;
                    goto do2;
                }
                goto done;

            case FLdatseg:
                //c.IEV2.Vseg = DATA;
                goto done;

            case FLcsdata:
            case FLfardata:
                goto done;

            case FLreg:
            case FLpseudo:
                assert(0);
                /* NOTREACHED */

            case FLfast:
                // NOTE(review): unlike FLauto below, this does not go through
                // L3, so no EBPtoESP adjustment is applied - confirm intended
                c.IEV2.Vpointer += s.Soffset + Fast.size + BPoff;
                break;

            case FLauto:
                c.IEV2.Vpointer += s.Soffset + Auto.size + BPoff;
            L3:
                // Common tail for frame-relative second operands
                if (!hasframe || (enforcealign && c.IFL2 != FLpara))
                    /* Convert to ESP relative address instead of EBP */
                    c.IEV2.Vpointer += EBPtoESP;
                break;

            case FLpara:
                c.IEV2.Vpointer += s.Soffset + Para.size;
                goto L3;

            case FLfltreg:
                c.IEV2.Vpointer += Foff + BPoff;
                goto L3;

            case FLallocatmp:
                c.IEV2.Vpointer += Alloca.offset + BPoff;
                goto L3;

            case FLfuncarg:
                c.IEV2.Vpointer += cgstate.funcarg.offset + BPoff;
                goto L3;

            case FLbprel:
                c.IEV2.Vpointer += s.Soffset;
                break;

            case FLstack:
                // NOTE(review): the first-operand FLstack case also subtracts
                // EEStack.offset; this one does not - confirm intended
                c.IEV2.Vpointer += s.Soffset + EBPtoESP - base;
                break;

            case FLcs:
            case FLndp:
            case FLregsave:
                assert(0);

            case FLconst:
                break;

            case FLlocalsize:
                c.IEV2.Vpointer += localsize;
                break;

            default:
                goto done;
        }
        // second operand is now a plain number
        c.IFL2 = FLconst;
  done:
        { }
    }
}
5572 
/*******************************
 * Compute the offset from BP of symbol s.
 *
 * Only symbols whose Sfl is FLpara, FLfast or FLauto are accepted;
 * any other Sfl is reported with WRFL/symbol_print and asserts.
 * A frame must exist (hasframe is asserted).
 * Params:
 *      s = symbol to get the offset of
 * Returns:
 *      s.Soffset plus the adjustment for the section the symbol is in
 */

@trusted
targ_size_t cod3_bpoffset(Symbol *s)
{
    symbol_debug(s);

    // amount to add to Soffset, depending on which section holds the symbol
    targ_size_t sectionBase;
    switch (s.Sfl)
    {
        case FLauto:
            sectionBase = Auto.size + BPoff;
            break;

        case FLfast:
            sectionBase = Fast.size + BPoff;
            break;

        case FLpara:
            sectionBase = Para.size;
            break;

        default:
            WRFL(s.Sfl);
            symbol_print(s);
            assert(0);
    }

    assert(hasframe);
    return s.Soffset + sectionBase;
}
5606 
5607 
5608 /*******************************
5609  * Find shorter versions of the same instructions.
5610  * Does these optimizations:
5611  *      replaces jmps to the next instruction with NOPs
5612  *      sign extension of modregrm displacement
5613  *      sign extension of immediate data (can't do it for OR, AND, XOR
5614  *              as the opcodes are not defined)
5615  *      short versions for AX EA
5616  *      short versions for reg EA
5617  * Code is neither removed nor added.
5618  * Params:
5619  *      b = block for code (or null)
5620  *      c = code list to optimize
5621  */
5622 
5623 @trusted
5624 void pinholeopt(code *c,block *b)
5625 {
5626     targ_size_t a;
5627     uint mod;
5628     ubyte ins;
5629     int usespace;
5630     int useopsize;
5631     int space;
5632     block *bn;
5633 
5634     debug
5635     {
5636         __gshared int tested; if (!tested) { tested++; pinholeopt_unittest(); }
5637     }
5638 
5639     debug
5640     {
5641         code *cstart = c;
5642         if (debugc)
5643         {
5644             printf("+pinholeopt(%p)\n",c);
5645         }
5646     }
5647 
5648     if (b)
5649     {
5650         bn = b.Bnext;
5651         usespace = (config.flags4 & CFG4space && b.BC != BCasm);
5652         useopsize = (I16 || (config.flags4 & CFG4space && b.BC != BCasm));
5653     }
5654     else
5655     {
5656         bn = null;
5657         usespace = (config.flags4 & CFG4space);
5658         useopsize = (I16 || config.flags4 & CFG4space);
5659     }
5660     for (; c; c = code_next(c))
5661     {
5662     L1:
5663         opcode_t op = c.Iop;
5664         if (c.Iflags & CFvex && c.Ivex.pfx == 0xC4)
5665             ins = vex_inssize(c);
5666         else if ((op & 0xFFFD00) == 0x0F3800)
5667             ins = inssize2[(op >> 8) & 0xFF];
5668         else if ((op & 0xFF00) == 0x0F00)
5669             ins = inssize2[op & 0xFF];
5670         else
5671             ins = inssize[op & 0xFF];
5672         if (ins & M)            // if modregrm byte
5673         {
5674             int shortop = (c.Iflags & CFopsize) ? !I16 : I16;
5675             int local_BPRM = BPRM;
5676 
5677             if (c.Iflags & CFaddrsize)
5678                 local_BPRM ^= 5 ^ 6;    // toggle between 5 and 6
5679 
5680             uint rm = c.Irm;
5681             reg_t reg = rm & modregrm(0,7,0);          // isolate reg field
5682             reg_t ereg = rm & 7;
5683             //printf("c = %p, op = %02x rm = %02x\n", c, op, rm);
5684 
5685             /* If immediate second operand      */
5686             if ((ins & T ||
5687                  ((op == 0xF6 || op == 0xF7) && (reg < modregrm(0,2,0) || reg > modregrm(0,3,0)))
5688                 ) &&
5689                 c.IFL2 == FLconst)
5690             {
5691                 int flags = c.Iflags & CFpsw;      /* if want result in flags */
5692                 targ_long u = c.IEV2.Vuns;
5693                 if (ins & E)
5694                     u = cast(byte) u;
5695                 else if (shortop)
5696                     u = cast(short) u;
5697 
5698                 // Replace CMP reg,0 with TEST reg,reg
5699                 if ((op & 0xFE) == 0x80 &&              // 80 is CMP R8,imm8; 81 is CMP reg,imm
5700                     rm >= modregrm(3,7,AX) &&
5701                     u == 0)
5702                 {
5703                     c.Iop = (op & 1) | 0x84;
5704                     c.Irm = modregrm(3,ereg,ereg);
5705                     if (c.Irex & REX_B)
5706                         c.Irex |= REX_R;
5707                     goto L1;
5708                 }
5709 
5710                 /* Optimize ANDs with an immediate constant             */
5711                 if ((op == 0x81 || op == 0x80) && reg == modregrm(0,4,0))
5712                 {
5713                     if (rm >= modregrm(3,4,AX))         // AND reg,imm
5714                     {
5715                         if (u == 0)
5716                         {
5717                             /* Replace with XOR reg,reg     */
5718                             c.Iop = 0x30 | (op & 1);
5719                             c.Irm = modregrm(3,ereg,ereg);
5720                             if (c.Irex & REX_B)
5721                                 c.Irex |= REX_R;
5722                             goto L1;
5723                         }
5724                         if (u == 0xFFFFFFFF && !flags)
5725                         {
5726                             c.Iop = NOP;
5727                             goto L1;
5728                         }
5729                     }
5730                     if (op == 0x81 && !flags)
5731                     {   // If we can do the operation in one byte
5732 
5733                         // If EA is not SI or DI
5734                         if ((rm < modregrm(3,4,SP) || I64) &&
5735                             (config.flags4 & CFG4space ||
5736                              config.target_cpu < TARGET_PentiumPro)
5737                            )
5738                         {
5739                             if ((u & 0xFFFFFF00) == 0xFFFFFF00)
5740                                 goto L2;
5741                             else if (rm < modregrm(3,0,0) || (!c.Irex && ereg < 4))
5742                             {
5743                                 if (!shortop)
5744                                 {
5745                                     if ((u & 0xFFFF00FF) == 0xFFFF00FF)
5746                                         goto L3;
5747                                 }
5748                                 else
5749                                 {
5750                                     if ((u & 0xFF) == 0xFF)
5751                                         goto L3;
5752                                 }
5753                             }
5754                         }
5755                         if (!shortop && useopsize)
5756                         {
5757                             if ((u & 0xFFFF0000) == 0xFFFF0000)
5758                             {
5759                                 c.Iflags ^= CFopsize;
5760                                 goto L1;
5761                             }
5762                             if ((u & 0xFFFF) == 0xFFFF && rm < modregrm(3,4,AX))
5763                             {
5764                                 c.IEV1.Voffset += 2; /* address MSW      */
5765                                 c.IEV2.Vuns >>= 16;
5766                                 c.Iflags ^= CFopsize;
5767                                 goto L1;
5768                             }
5769                             if (rm >= modregrm(3,4,AX))
5770                             {
5771                                 if (u == 0xFF && (rm <= modregrm(3,4,BX) || I64))
5772                                 {
5773                                     c.Iop = MOVZXb;     // MOVZX
5774                                     c.Irm = modregrm(3,ereg,ereg);
5775                                     if (c.Irex & REX_B)
5776                                         c.Irex |= REX_R;
5777                                     goto L1;
5778                                 }
5779                                 if (u == 0xFFFF)
5780                                 {
5781                                     c.Iop = MOVZXw;     // MOVZX
5782                                     c.Irm = modregrm(3,ereg,ereg);
5783                                     if (c.Irex & REX_B)
5784                                         c.Irex |= REX_R;
5785                                     goto L1;
5786                                 }
5787                             }
5788                         }
5789                     }
5790                 }
5791 
5792                 /* Look for ADD,OR,SUB,XOR with u that we can eliminate */
5793                 if (!flags &&
5794                     (op == 0x81 || op == 0x80) &&
5795                     (reg == modregrm(0,0,0) || reg == modregrm(0,1,0) ||  // ADD,OR
5796                      reg == modregrm(0,5,0) || reg == modregrm(0,6,0))    // SUB, XOR
5797                    )
5798                 {
5799                     if (u == 0)
5800                     {
5801                         c.Iop = NOP;
5802                         goto L1;
5803                     }
5804                     if (u == ~0 && reg == modregrm(0,6,0))  /* XOR  */
5805                     {
5806                         c.Iop = 0xF6 | (op & 1);       /* NOT  */
5807                         c.Irm ^= modregrm(0,6^2,0);
5808                         goto L1;
5809                     }
5810                     if (!shortop &&
5811                         useopsize &&
5812                         op == 0x81 &&
5813                         (u & 0xFFFF0000) == 0 &&
5814                         (reg == modregrm(0,6,0) || reg == modregrm(0,1,0)))
5815                     {
5816                         c.Iflags ^= CFopsize;
5817                         goto L1;
5818                     }
5819                 }
5820 
5821                 /* Look for TEST or OR or XOR with an immediate constant */
5822                 /* that we can replace with a byte operation            */
5823                 if (op == 0xF7 && reg == modregrm(0,0,0) ||
5824                     op == 0x81 && reg == modregrm(0,6,0) && !flags ||
5825                     op == 0x81 && reg == modregrm(0,1,0))
5826                 {
5827                     // See if we can replace a dword with a word
5828                     // (avoid for 32 bit instructions, because CFopsize
5829                     //  is too slow)
5830                     if (!shortop && useopsize)
5831                     {
5832                         if ((u & 0xFFFF0000) == 0)
5833                         {
5834                             c.Iflags ^= CFopsize;
5835                             goto L1;
5836                         }
5837                         /* If memory (not register) addressing mode     */
5838                         if ((u & 0xFFFF) == 0 && rm < modregrm(3,0,AX))
5839                         {
5840                             c.IEV1.Voffset += 2; /* address MSW  */
5841                             c.IEV2.Vuns >>= 16;
5842                             c.Iflags ^= CFopsize;
5843                             goto L1;
5844                         }
5845                     }
5846 
5847                     // If EA is not SI or DI
5848                     if (rm < (modregrm(3,0,SP) | reg) &&
5849                         (usespace ||
5850                          config.target_cpu < TARGET_PentiumPro)
5851                        )
5852                     {
5853                         if ((u & 0xFFFFFF00) == 0)
5854                         {
5855                         L2: c.Iop--;           /* to byte instruction  */
5856                             c.Iflags &= ~CFopsize;
5857                             goto L1;
5858                         }
5859                         if (((u & 0xFFFF00FF) == 0 ||
5860                              (shortop && (u & 0xFF) == 0)) &&
5861                             (rm < modregrm(3,0,0) || (!c.Irex && ereg < 4)))
5862                         {
5863                         L3:
5864                             c.IEV2.Vuns >>= 8;
5865                             if (rm >= (modregrm(3,0,AX) | reg))
5866                                 c.Irm |= 4;    /* AX.AH, BX.BH, etc. */
5867                             else
5868                                 c.IEV1.Voffset += 1;
5869                             goto L2;
5870                         }
5871                     }
5872 
5873                     // BUG: which is right?
5874                     //else if ((u & 0xFFFF0000) == 0)
5875 
5876                     else if (0 && op == 0xF7 &&
5877                              rm >= modregrm(3,0,SP) &&
5878                              (u & 0xFFFF0000) == 0)
5879 
5880                         c.Iflags &= ~CFopsize;
5881                 }
5882 
5883                 // Try to replace TEST reg,-1 with TEST reg,reg
5884                 if (op == 0xF6 && rm >= modregrm(3,0,AX) && rm <= modregrm(3,0,7)) // TEST regL,immed8
5885                 {
5886                     if ((u & 0xFF) == 0xFF)
5887                     {
5888                       L4:
5889                         c.Iop = 0x84;          // TEST regL,regL
5890                         c.Irm = modregrm(3,ereg,ereg);
5891                         if (c.Irex & REX_B)
5892                             c.Irex |= REX_R;
5893                         c.Iflags &= ~CFopsize;
5894                         goto L1;
5895                     }
5896                 }
5897                 if (op == 0xF7 && rm >= modregrm(3,0,AX) && rm <= modregrm(3,0,7) && (I64 || ereg < 4))
5898                 {
5899                     if (u == 0xFF)
5900                     {
5901                         if (ereg & 4) // SIL,DIL,BPL,SPL need REX prefix
5902                             c.Irex |= REX;
5903                         goto L4;
5904                     }
5905                     if ((u & 0xFFFF) == 0xFF00 && shortop && !c.Irex && ereg < 4)
5906                     {
5907                         ereg |= 4;                /* to regH      */
5908                         goto L4;
5909                     }
5910                 }
5911 
5912                 /* Look for sign extended immediate data */
5913                 if (cast(byte) u == u)
5914                 {
5915                     if (op == 0x81)
5916                     {
5917                         if (reg != 0x08 && reg != 0x20 && reg != 0x30)
5918                             c.Iop = op = 0x83;         /* 8 bit sgn ext */
5919                     }
5920                     else if (op == 0x69)                /* IMUL rw,ew,dw */
5921                         c.Iop = op = 0x6B;             /* IMUL rw,ew,db */
5922                 }
5923 
5924                 // Look for SHIFT EA,imm8 we can replace with short form
5925                 if (u == 1 && ((op & 0xFE) == 0xC0))
5926                     c.Iop |= 0xD0;
5927 
5928             } /* if immediate second operand */
5929 
5930             /* Look for AX short form */
5931             if (ins & A)
5932             {
5933                 if (rm == modregrm(0,AX,local_BPRM) &&
5934                     !(c.Irex & REX_R) &&               // and it's AX, not R8
5935                     (op & ~3) == 0x88 &&
5936                     !I64)
5937                 {
5938                     op = ((op & 3) + 0xA0) ^ 2;
5939                     /* 8A. A0 */
5940                     /* 8B. A1 */
5941                     /* 88. A2 */
5942                     /* 89. A3 */
5943                     c.Iop = op;
5944                     c.IFL2 = c.IFL1;
5945                     c.IEV2 = c.IEV1;
5946                 }
5947 
5948                 /* Replace MOV REG1,REG2 with MOV EREG1,EREG2   */
5949                 else if (!I16 &&
5950                          (op == 0x89 || op == 0x8B) &&
5951                          (rm & 0xC0) == 0xC0 &&
5952                          (!b || b.BC != BCasm)
5953                         )
5954                     c.Iflags &= ~CFopsize;
5955 
5956                 // If rm is AX
5957                 else if ((rm & modregrm(3,0,7)) == modregrm(3,0,AX) && !(c.Irex & (REX_R | REX_B)))
5958                 {
5959                     switch (op)
5960                     {
5961                         case 0x80:  op = reg | 4; break;
5962                         case 0x81:  op = reg | 5; break;
5963                         case 0x87:  op = 0x90 + (reg>>3); break;    // XCHG
5964 
5965                         case 0xF6:
5966                             if (reg == 0)
5967                                 op = 0xA8;  /* TEST AL,immed8       */
5968                             break;
5969 
5970                         case 0xF7:
5971                             if (reg == 0)
5972                                 op = 0xA9;  /* TEST AX,immed16      */
5973                             break;
5974 
5975                         default:
5976                             break;
5977                     }
5978                     c.Iop = op;
5979                 }
5980             }
5981 
5982             /* Look for reg short form */
5983             if ((ins & R) && (rm & 0xC0) == 0xC0)
5984             {
5985                 switch (op)
5986                 {
5987                     case 0xC6:  op = 0xB0 + ereg; break;
5988                     case 0xC7: // if no sign extension
5989                         if (!(c.Irex & REX_W && c.IEV2.Vint < 0))
5990                         {
5991                             c.Irm = 0;
5992                             c.Irex &= ~REX_W;
5993                             op = 0xB8 + ereg;
5994                         }
5995                         break;
5996 
5997                     case 0xFF:
5998                         switch (reg)
5999                         {   case 6<<3: op = 0x50+ereg; break;/* PUSH*/
6000                             case 0<<3: if (!I64) op = 0x40+ereg; break; /* INC*/
6001                             case 1<<3: if (!I64) op = 0x48+ereg; break; /* DEC*/
6002                             default: break;
6003                         }
6004                         break;
6005 
6006                     case 0x8F:  op = 0x58 + ereg; break;
6007                     case 0x87:
6008                         if (reg == 0 && !(c.Irex & (REX_R | REX_B))) // Issue 12968: Needed to ensure it's referencing RAX, not R8
6009                             op = 0x90 + ereg;
6010                         break;
6011 
6012                     default:
6013                         break;
6014                 }
6015                 c.Iop = op;
6016             }
6017 
6018             // Look to remove redundant REX prefix on XOR
6019             if (c.Irex == REX_W // ignore ops involving R8..R15
6020                 && (op == 0x31 || op == 0x33) // XOR
6021                 && ((rm & 0xC0) == 0xC0) // register direct
6022                 && ((reg >> 3) == ereg)) // register with itself
6023             {
6024                 c.Irex = 0;
6025             }
6026 
6027             // Look to replace SHL reg,1 with ADD reg,reg
6028             if ((op & ~1) == 0xD0 &&
6029                      (rm & modregrm(3,7,0)) == modregrm(3,4,0) &&
6030                      config.target_cpu >= TARGET_80486)
6031             {
6032                 c.Iop &= 1;
6033                 c.Irm = cast(ubyte)((rm & modregrm(3,0,7)) | (ereg << 3));
6034                 if (c.Irex & REX_B)
6035                     c.Irex |= REX_R;
6036                 if (!(c.Iflags & CFpsw) && !I16)
6037                     c.Iflags &= ~CFopsize;
6038                 goto L1;
6039             }
6040 
6041             /* Look for sign extended modregrm displacement, or 0
6042              * displacement.
6043              */
6044 
6045             if (((rm & 0xC0) == 0x80) && // it's a 16/32 bit disp
6046                 c.IFL1 == FLconst)      // and it's a constant
6047             {
6048                 a = c.IEV1.Vpointer;
6049                 if (a == 0 && (rm & 7) != local_BPRM &&         // if 0[disp]
6050                     !(local_BPRM == 5 && (rm & 7) == 4 && (c.Isib & 7) == BP)
6051                    )
6052                     c.Irm &= 0x3F;
6053                 else if (!I16)
6054                 {
6055                     if (cast(targ_size_t)cast(targ_schar)a == a)
6056                         c.Irm ^= 0xC0;                 /* do 8 sx      */
6057                 }
6058                 else if ((cast(targ_size_t)cast(targ_schar)a & 0xFFFF) == (a & 0xFFFF))
6059                     c.Irm ^= 0xC0;                     /* do 8 sx      */
6060             }
6061 
6062             /* Look for LEA reg,[ireg], replace with MOV reg,ireg       */
6063             if (op == LEA)
6064             {
6065                 rm = c.Irm & 7;
6066                 mod = c.Irm & modregrm(3,0,0);
6067                 if (mod == 0)
6068                 {
6069                     if (!I16)
6070                     {
6071                         switch (rm)
6072                         {
6073                             case 4:
6074                             case 5:
6075                                 break;
6076 
6077                             default:
6078                                 c.Irm |= modregrm(3,0,0);
6079                                 c.Iop = 0x8B;
6080                                 break;
6081                         }
6082                     }
6083                     else
6084                     {
6085                         switch (rm)
6086                         {
6087                             case 4:     rm = modregrm(3,0,SI);  goto L6;
6088                             case 5:     rm = modregrm(3,0,DI);  goto L6;
6089                             case 7:     rm = modregrm(3,0,BX);  goto L6;
6090                             L6:     c.Irm = cast(ubyte)(rm + reg);
6091                                     c.Iop = 0x8B;
6092                                     break;
6093 
6094                             default:
6095                                     break;
6096                         }
6097                     }
6098                 }
6099 
6100                 /* replace LEA reg,0[BP] with MOV reg,BP        */
6101                 else if (mod == modregrm(1,0,0) && rm == local_BPRM &&
6102                         c.IFL1 == FLconst && c.IEV1.Vpointer == 0)
6103                 {
6104                     c.Iop = 0x8B;          /* MOV reg,BP   */
6105                     c.Irm = cast(ubyte)(modregrm(3,0,BP) + reg);
6106                 }
6107             }
6108 
6109             // Replace [R13] with 0[R13]
6110             if (c.Irex & REX_B && ((c.Irm & modregrm(3,0,7)) == modregrm(0,0,BP) ||
6111                                     issib(c.Irm) && (c.Irm & modregrm(3,0,0)) == 0 && (c.Isib & 7) == BP))
6112             {
6113                 c.Irm |= modregrm(1,0,0);
6114                 c.IFL1 = FLconst;
6115                 c.IEV1.Vpointer = 0;
6116             }
6117         }
6118         else if (!(c.Iflags & CFvex))
6119         {
6120             switch (op)
6121             {
6122                 default:
6123                     // Look for MOV r64, immediate
6124                     if ((c.Irex & REX_W) && (op & ~7) == 0xB8)
6125                     {
6126                         /* Look for zero extended immediate data */
6127                         if (c.IEV2.Vsize_t == c.IEV2.Vuns)
6128                         {
6129                             c.Irex &= ~REX_W;
6130                         }
6131                         /* Look for sign extended immediate data */
6132                         else if (c.IEV2.Vsize_t == c.IEV2.Vint)
6133                         {
6134                             c.Irm = modregrm(3,0,op & 7);
6135                             c.Iop = op = 0xC7;
6136                             c.IEV2.Vsize_t = c.IEV2.Vuns;
6137                         }
6138                     }
6139                     if ((op & ~0x0F) != 0x70)
6140                         break;
6141                     goto case JMP;
6142 
6143                 case JMP:
6144                     switch (c.IFL2)
6145                     {
6146                         case FLcode:
6147                             if (c.IEV2.Vcode == code_next(c))
6148                             {
6149                                 c.Iop = NOP;
6150                                 continue;
6151                             }
6152                             break;
6153 
6154                         case FLblock:
6155                             if (!code_next(c) && c.IEV2.Vblock == bn)
6156                             {
6157                                 c.Iop = NOP;
6158                                 continue;
6159                             }
6160                             break;
6161 
6162                         case FLconst:
6163                         case FLfunc:
6164                         case FLextern:
6165                             break;
6166 
6167                         default:
6168                             WRFL(c.IFL2);
6169                             assert(0);
6170                     }
6171                     break;
6172 
6173                 case 0x68:                      // PUSH immed16
6174                     if (c.IFL2 == FLconst)
6175                     {
6176                         targ_long u = c.IEV2.Vuns;
6177                         if (I64 ||
6178                             ((c.Iflags & CFopsize) ? I16 : I32))
6179                         {   // PUSH 32/64 bit operand
6180                             if (u == cast(byte) u)
6181                                 c.Iop = 0x6A;          // PUSH immed8
6182                         }
6183                         else // PUSH 16 bit operand
6184                         {
6185                             if (cast(short)u == cast(byte) u)
6186                                 c.Iop = 0x6A;          // PUSH immed8
6187                         }
6188                     }
6189                     break;
6190             }
6191         }
6192     }
6193 
6194     debug
6195     if (debugc)
6196     {
6197         printf("-pinholeopt(%p)\n",cstart);
6198         for (c = cstart; c; c = code_next(c))
6199             code_print(c);
6200     }
6201 }
6202 
6203 
debug
{
/*************************
 * Debug-only self test for pinholeopt().
 *
 * Every row of the table pairs an instruction that is handed to
 * pinholeopt() with the instruction it is expected to have become.
 * A row whose input names a memory model (16/32/64) is skipped unless
 * that model is the one currently selected; rows with model 0 always run.
 */
@trusted
private void pinholeopt_unittest()
{
    //printf("pinholeopt_unittest()\n");

    // Compact description of one instruction
    static struct CS
    {
        uint model,op,ea;       // model is only consulted on the input side of a row
        targ_size_t ev1,ev2;    // stored to / compared with IEV1.Vsize_t and IEV2.Vsize_t
        uint flags;             // stored to / compared with Iflags
    }
    __gshared CS[2][22] tests =
    [
        // XOR reg,immed                            NOT regL
        [ { 16,0x81,modregrm(3,6,BX),0,0xFF,0 },    { 0,0xF6,modregrm(3,2,BX),0,0xFF } ],

        // MOV 0[BX],3                               MOV [BX],3
        [ { 16,0xC7,modregrm(2,0,7),0,3 },          { 0,0xC7,modregrm(0,0,7),0,3 } ],

/+      // only if config.flags4 & CFG4space
        // TEST regL,immed8
        [ { 0,0xF6,modregrm(3,0,BX),0,0xFF,0 },    { 0,0x84,modregrm(3,BX,BX),0,0xFF }],
        [ { 0,0xF7,modregrm(3,0,BX),0,0xFF,0 },    { 0,0x84,modregrm(3,BX,BX),0,0xFF }],
        [ { 64,0xF6,modregrmx(3,0,R8),0,0xFF,0 },  { 0,0x84,modregxrmx(3,R8,R8),0,0xFF }],
        [ { 64,0xF7,modregrmx(3,0,R8),0,0xFF,0 },  { 0,0x84,modregxrmx(3,R8,R8),0,0xFF }],
+/

        // PUSH immed => PUSH immed8
        [ { 0,0x68,0,0,0 },    { 0,0x6A,0,0,0 }],
        [ { 0,0x68,0,0,0x7F }, { 0,0x6A,0,0,0x7F }],
        [ { 0,0x68,0,0,0x80 }, { 0,0x68,0,0,0x80 }],
        [ { 16,0x68,0,0,0,CFopsize },    { 0,0x6A,0,0,0,CFopsize }],
        [ { 16,0x68,0,0,0x7F,CFopsize }, { 0,0x6A,0,0,0x7F,CFopsize }],
        [ { 16,0x68,0,0,0x80,CFopsize }, { 0,0x68,0,0,0x80,CFopsize }],
        [ { 16,0x68,0,0,0x10000,0 },     { 0,0x6A,0,0,0x10000,0 }],
        [ { 16,0x68,0,0,0x10000,CFopsize }, { 0,0x68,0,0,0x10000,CFopsize }],
        [ { 32,0x68,0,0,0,CFopsize },    { 0,0x6A,0,0,0,CFopsize }],
        [ { 32,0x68,0,0,0x7F,CFopsize }, { 0,0x6A,0,0,0x7F,CFopsize }],
        [ { 32,0x68,0,0,0x80,CFopsize }, { 0,0x68,0,0,0x80,CFopsize }],
        [ { 32,0x68,0,0,0x10000,CFopsize },    { 0,0x6A,0,0,0x10000,CFopsize }],
        [ { 32,0x68,0,0,0x8000,CFopsize }, { 0,0x68,0,0,0x8000,CFopsize }],

        // clear r64, for r64 != R8..R15
        [ { 64,0x31,0x800C0,0,0,0 }, { 0,0x31,0xC0,0,0,0}],
        [ { 64,0x33,0x800C0,0,0,0 }, { 0,0x33,0xC0,0,0,0}],

        // MOV r64, immed
        [ { 64,0xC7,0x800C0,0,0xFFFFFFFF,0 }, { 0,0xC7,0x800C0,0,0xFFFFFFFF,0}],
        [ { 64,0xC7,0x800C0,0,0x7FFFFFFF,0 }, { 0,0xB8,0,0,0x7FFFFFFF,0}],
        [ { 64,0xB8,0x80000,0,0xFFFFFFFF,0 }, { 0,0xB8,0,0,0xFFFFFFFF,0 }],
        [ { 64,0xB8,0x80000,0,cast(targ_size_t)0x1FFFFFFFF,0 }, { 0,0xB8,0x80000,0,cast(targ_size_t)0x1FFFFFFFF,0 }],
        [ { 64,0xB8,0x80000,0,cast(targ_size_t)0xFFFFFFFFFFFFFFFF,0 }, { 0,0xC7,0x800C0,0,cast(targ_size_t)0xFFFFFFFF,0}],
    ];

    //config.flags4 |= CFG4space;
    foreach (idx, ref row; tests)
    {
        const CS given = row[0];        // what pinholeopt() is fed
        const CS want  = row[1];        // what it must produce

        // Skip rows written for a memory model other than the current one
        const uint model = given.model;
        if (model &&
            ((I16 && model != 16) ||
             (I32 && model != 32) ||
             (I64 && model != 64)))
            continue;

        //printf("[%d]\n", cast(int)idx);

        // Build the input instruction starting from an all-zero code
        code insn = void;
        memset(&insn, 0, insn.sizeof);
        insn.Iop = given.op;
        insn.Iea = given.ea;
        insn.IFL1 = FLconst;
        insn.IFL2 = FLconst;
        insn.IEV1.Vsize_t = given.ev1;
        insn.IEV2.Vsize_t = given.ev2;
        insn.Iflags = given.flags;

        pinholeopt(&insn, null);

        // Report an opcode mismatch before failing, to identify the row
        if (insn.Iop != want.op)
        {
            printf("[%d] Iop = x%02x, pout = x%02x\n", cast(int)idx, insn.Iop, want.op);
            assert(0);
        }
        assert(insn.Iea == want.ea);
        assert(insn.IEV1.Vsize_t == want.ev1);
        assert(insn.IEV2.Vsize_t == want.ev2);
        assert(insn.Iflags == want.flags);
    }
}
}
6294 
/**************************
 * See if an immediate-operand instruction (opcode 0x80 or 0x81 with a
 * constant second operand) can be replaced with the register-operand
 * form, using a register that reghasvalue() reports as already holding
 * the immediate value. Only attempted when CFG4optimized is set.
 * Params:
 *      c = instruction to examine; rewritten in place when a register is found
 */
@trusted
void simplify_code(code* c)
{
    if (!(config.flags4 & CFG4optimized))
        return;

    const bool byteForm = c.Iop == 0x80;
    if (!byteForm && c.Iop != 0x81)
        return;

    if (c.IFL2 != FLconst)
        return;

    // Look for a register that holds the immediate value
    reg_t reg;
    const immediate = I64 ? c.IEV2.Vsize_t : c.IEV2.Vlong;
    if (!reghasvalue(byteForm ? BYTEREGS : ALLREGS, immediate, reg))
        return;

    // Leave 16 bit code carrying an operand size prefix alone
    if (I16 && (c.Iflags & CFopsize))
        return;

    // Opcode of the register form, indexed by the reg field of the modregrm byte
    static immutable ubyte[8] registerForm =
            [ 0x00,0x08,0x10,0x18,0x20,0x28,0x30,0x38 ];

    //printf("replacing 0x%02x, val = x%lx\n",c.Iop,c.IEV2.Vlong);
    const uint regField = (c.Irm & modregrm(0,7,0)) >> 3;
    c.Iop = registerForm[regField] | (c.Iop & 1);   // keep the low (operand width) bit
    code_newreg(c, reg);

    // byte operation on a register with bit 2 set: request a REX prefix
    // NOTE(review): presumably so SPL/BPL/SIL/DIL are selected rather than AH..BH - confirm
    if (I64 && !(c.Iop & 1) && (reg & 4))
        c.Irex |= REX;
}
6317 
/**************************
 * Compute jump addresses for FLcode.
 *
 * Walks the instruction list and, for each conditional branch (0x70..0x7F),
 * JMP, JMPS, JCXZ or CALL whose second operand is a code pointer (FLcode),
 * replaces that operand with a constant displacement (FLconst) obtained by
 * summing calccodsize() over the instructions between the jump and its
 * target.
 *
 * Note: only works for forward referenced code.
 *       only direct jumps and branches are detected.
 *       LOOP instructions only work for backward refs.
 * When the forward search does not find the target, control transfers to
 * the backward reference path (Lbackjmp), which stores an 8 bit
 * displacement and asserts if the target is not found before the jump.
 *
 * Params:
 *      c = start of the instruction list; instructions are modified in
 *          place, and a new instruction may be linked in after a
 *          CFjmp16 conditional branch in 16 bit mode
 */

@trusted
void jmpaddr(code *c)
{
    code* ci,cn,ctarg,cstart;
    targ_size_t ad;

    //printf("jmpaddr()\n");
    cstart = c;                           /* remember start of code       */
    while (c)
    {
        const op = c.Iop;
        // Forward references: op must index inssize[] and be one of the handled jumps
        if (op <= 0xEB &&
            inssize[op] & T &&   // if second operand
            c.IFL2 == FLcode &&
            ((op & ~0x0F) == 0x70 || op == JMP || op == JMPS || op == JCXZ || op == CALL))
        {
            ci = code_next(c);
            ctarg = c.IEV2.Vcode;  /* target code                  */
            ad = 0;                 /* IP displacement              */
            // Add up the sizes of everything between the jump and its target
            while (ci && ci != ctarg)
            {
                ad += calccodsize(ci);
                ci = code_next(ci);
            }
            if (!ci)
                goto Lbackjmp;      // couldn't find it
            if (!I16 || op == JMP || op == JMPS || op == JCXZ || op == CALL)
                c.IEV2.Vpointer = ad;
            else                    /* else conditional             */
            {
                // Only reached in 16 bit mode with a conditional branch
                if (!(c.Iflags & CFjmp16))     /* if branch    */
                    c.IEV2.Vpointer = ad;
                else            /* branch around a long jump    */
                {
                    // Insert a new, zeroed instruction right after c
                    cn = code_next(c);
                    c.next = code_calloc();
                    code_next(c).next = cn;
                    c.Iop = op ^ 1;        /* converse jmp */
                    c.Iflags &= ~CFjmp16;
                    // Displacement that skips the inserted JMP (3 bytes in 16 bit mode)
                    c.IEV2.Vpointer = I16 ? 3 : 5;
                    cn = code_next(c);
                    cn.Iop = JMP;          /* long jump    */
                    cn.IFL2 = FLconst;
                    cn.IEV2.Vpointer = ad;
                }
            }
            c.IFL2 = FLconst;       // operand is now a resolved displacement
        }
        if (op == LOOP && c.IFL2 == FLcode)    /* backwards refs       */
        {
          Lbackjmp:                 // also entered from the failed forward search above
            ctarg = c.IEV2.Vcode;
            // Find the target by scanning from the start; it must precede c
            for (ci = cstart; ci != ctarg; ci = code_next(ci))
                if (!ci || ci == c)
                    assert(0);
            // NOTE(review): the initial 2 presumably accounts for the size of
            // the 2 byte short jump instruction itself - confirm
            ad = 2;                 /* - IP displacement            */
            while (ci != c)
            {
                assert(ci);
                ad += calccodsize(ci);
                ci = code_next(ci);
            }
            // Negative displacement, truncated to 8 bits
            c.IEV2.Vpointer = (-ad) & 0xFF;
            c.IFL2 = FLconst;
        }
        c = code_next(c);
    }
}
6393 
/*******************************
 * Calculate bl.Bsize.
 * Params:
 *      c = first instruction of the list (may be null)
 * Returns:
 *      sum of calccodsize() over every instruction in the list
 */

uint calcblksize(code *c)
{
    uint total = 0;
    while (c)
    {
        //printf("off=%02x, code %p: op=%02x\n", total, c, c.Iop);
        total += calccodsize(c);
        c = code_next(c);
    }
    //printf("calcblksize() = %d\n", total);
    return total;
}
6410 
/*****************************
 * Calculate and return code size of a code.
 * Note that NOPs are sometimes used as markers, but are
 * never output. LINNUMs are never output.
 * Note: This routine must be fast. Profiling shows it is significant.
 *
 * The size is assembled in stages: a base size looked up from the
 * instruction size tables (or a special case), adjustments for prefix
 * bytes, the modregrm displacement / SIB byte, and finally the REX prefix.
 *
 * Params:
 *      c = instruction to measure
 * Returns:
 *      size in bytes; 0 for NOP and ESCAPE
 */

@trusted
uint calccodsize(code *c)
{
    uint size;
    ubyte rm,mod,ins;
    uint iflags;
    uint i32 = I32 || I64;      // 1 if not compiling 16 bit code
    uint a32 = i32;             // toggled below by an address size prefix (except in 64 bit mode)

    debug
    assert((a32 & ~1) == 0);

    iflags = c.Iflags;
    opcode_t op = c.Iop;
    //printf("calccodsize(x%08x), Iflags = x%x\n", op, iflags);
    if (iflags & CFvex && c.Ivex.pfx == 0xC4)
    {
        // VEX instruction with prefix byte 0xC4: size comes from vex_inssize(),
        // and the opcode switch and prefix adjustments below are skipped
        ins = vex_inssize(c);
        size = ins & 7;
        goto Lmodrm;
    }
    else if ((op & 0xFF00) == 0x0F00 || (op & 0xFFFD00) == 0x0F3800)
        op = 0x0F;              // 0F-- , 0F38-- and 0F3A-- all dispatch to case 0x0F
    else
        op &= 0xFF;
    switch (op)
    {
        case 0x0F:
            if ((c.Iop & 0xFFFD00) == 0x0F3800)
            {   // 3 byte op ( 0F38-- or 0F3A-- )
                ins = inssize2[(c.Iop >> 8) & 0xFF];
                size = ins & 7;
                if (c.Iop & 0xFF000000)     // one more byte stored above the 3 opcode bytes
                  size++;
            }
            else
            {   // 2 byte op ( 0F-- )
                ins = inssize2[c.Iop & 0xFF];
                size = ins & 7;
                if (c.Iop & 0xFF0000)       // one more byte stored above the 2 opcode bytes
                  size++;
            }
            break;

        case 0x90:
            size = (c.Iop == PAUSE) ? 2 : 1;
            goto Lret2;

        case NOP:
        case ESCAPE:
            size = 0;                   // since these won't be output
            goto Lret2;

        case ASM:
            if (c.Iflags == CFaddrsize)        // kludge for DA inline asm
                size = _tysize[TYnptr];
            else
                size = cast(uint)c.IEV1.len;
            goto Lret2;

        case 0xA1:
        case 0xA3:
            if (c.Irex)
            {
                size = 9;               // 64 bit immediate value for MOV to/from RAX
                goto Lret;              // the REX byte itself is added at Lret
            }
            goto default;

        case 0xF6:                      /* TEST mem8,immed8             */
            ins = inssize[op];
            size = ins & 7;
            if (i32)
                size = inssize32[op];
            if ((c.Irm & (7<<3)) == 0)  // reg field 0 is the form with an immediate
                size++;                 /* size of immed8               */
            break;

        case 0xF7:
            ins = inssize[op];
            size = ins & 7;
            if (i32)
                size = inssize32[op];
            if ((c.Irm & (7<<3)) == 0)  // reg field 0 is the form with an immediate
                // 4 byte immediate when exactly one of i32 / operand size prefix holds
                size += (i32 ^ ((iflags & CFopsize) !=0)) ? 4 : 2;
            break;

        case 0xFA:
        case 0xFB:
            // ENDBR32 / ENDBR64 have 0xFA / 0xFB as their low byte
            if (c.Iop == ENDBR32 || c.Iop == ENDBR64)
            {
                size = 4;
                break;
            }
            goto default;

        default:
            ins = inssize[op];
            size = ins & 7;
            if (i32)
                size = inssize32[op];
    }

    // Account for prefix bytes requested through Iflags
    if (iflags & (CFwait | CFopsize | CFaddrsize | CFSEG))
    {
        if (iflags & CFwait)    // if add FWAIT prefix
            size++;
        if (iflags & CFSEG)     // if segment override
            size++;

        // If the instruction has a second operand that is not an 8 bit,
        // and the operand size prefix is present, then fix the size computation
        // because the operand size will be different.
        // Walter, I had problems with this bit at the end.  There can still be
        // an ADDRSIZE prefix for these and it does indeed change the operand size.

        if (iflags & (CFopsize | CFaddrsize))
        {
            if ((ins & (T|E)) == T)
            {
                // Opcodes selected by this mask include 0xA0..0xA3; for them
                // it is the address size prefix that changes the operand size
                if ((op & 0xAC) == 0xA0)
                {
                    if (iflags & CFaddrsize && !I64)
                    {   if (I32)
                            size -= 2;
                        else
                            size += 2;
                    }
                }
                else if (iflags & CFopsize)
                {   if (I16)
                        size += 2;
                    else
                        size -= 2;
                }
            }
            if (iflags & CFaddrsize)
            {   if (!I64)
                    a32 ^= 1;           // addressing mode is flipped for the modregrm decoding below
                size++;                 // +1 for the address size prefix
            }
            if (iflags & CFopsize)
                size++;                         /* +1 for OPSIZE prefix         */
        }
    }

Lmodrm:
    if ((op & ~0x0F) == 0x70)
    {
        if (iflags & CFjmp16)           // if long branch
            size += I16 ? 3 : 4;        // + 3(4) bytes for JMP
    }
    else if (ins & M)                   // if modregrm byte
    {
        rm = c.Irm;
        mod = rm & 0xC0;
        if (a32 || I64)
        {   // 32 bit addressing
            if (issib(rm))
                size++;                 // SIB byte
            switch (mod)
            {   case 0:
                    if (issib(rm) && (c.Isib & 7) == 5 ||
                        (rm & 7) == 5)
                        size += 4;      /* disp32                       */
                    if (c.Irex & REX_B && (rm & 7) == 5)
                        /* Instead of selecting R13, this mode is an [RIP] relative
                         * address. Although valid, it's redundant, and should not
                         * be generated. Instead, generate 0[R13] instead of [R13].
                         */
                        assert(0);
                    break;

                case 0x40:
                    size++;             /* disp8                        */
                    break;

                case 0x80:
                    size += 4;          /* disp32                       */
                    break;

                default:
                    break;
            }
        }
        else
        {   // 16 bit addressing
            if (mod == 0x40)            /* 01: 8 bit displacement       */
                size++;
            else if (mod == 0x80 || (mod == 0 && (rm & 7) == 6))
                size += 2;              // 16 bit displacement
        }
    }

Lret:
    // REX prefix, only counted for instructions not flagged CFvex
    if (!(iflags & CFvex) && c.Irex)
    {
        size++;
        if (c.Irex & REX_W && (op & ~7) == 0xB8)
            size += 4;                  // opcodes 0xB8..0xBF with REX_W carry 4 more immediate bytes
    }
Lret2:
    //printf("op = x%02x, size = %d\n",op,size);
    return size;
}
6623 
/********************************
 * Return !=0 if codes match.
 *
 * Two instructions match when they are the same object, or have the same
 * opcode and flags and their modregrm/SIB bytes and operands (as far as
 * the instruction has them) are equal.
 * ESCctor/ESCdtor escapes never match; NOPs and all other ESCAPEs with
 * equal opcodes always match.
 *
 * This function is currently compiled out with `static if (0)`.
 * Params:
 *      c1 = first instruction
 *      c2 = second instruction
 * Returns:
 *      1 if the instructions match, 0 if not
 */

static if (0)
{

@trusted        // calls memcmp()
int code_match(code *c1,code *c2)
{
    code cs1,cs2;
    ubyte ins;
    ubyte rm;   // modregrm byte of cs1; declared here so no goto skips its declaration

    if (c1 == c2)
        goto match;
    cs1 = *c1;
    cs2 = *c2;
    if (cs1.Iop != cs2.Iop)
        goto nomatch;
    switch (cs1.Iop)
    {
        case ESCAPE | ESCctor:
        case ESCAPE | ESCdtor:
            goto nomatch;

        case NOP:
            goto match;

        case ASM:
            // inline asm matches when the byte sequences are identical
            if (cs1.IEV1.len == cs2.IEV1.len &&
                memcmp(cs1.IEV1.bytes,cs2.IEV1.bytes,cs1.IEV1.len) == 0)
                goto match;
            else
                goto nomatch;

        default:
            if ((cs1.Iop & 0xFF) == ESCAPE)
                goto match;
            break;
    }
    if (cs1.Iflags != cs2.Iflags)
        goto nomatch;

    // Pick the instruction attributes from the table matching the opcode length
    ins = inssize[cs1.Iop & 0xFF];
    if ((cs1.Iop & 0xFFFD00) == 0x0F3800)
    {
        ins = inssize2[(cs1.Iop >> 8) & 0xFF];
    }
    else if ((cs1.Iop & 0xFF00) == 0x0F00)
    {
        ins = inssize2[cs1.Iop & 0xFF];
    }

    if (ins & M)                // if modregrm byte
    {
        if (cs1.Irm != cs2.Irm)
            goto nomatch;
        rm = cs1.Irm;           // same as cs2.Irm from here on
        if ((rm & 0xC0) == 0xC0)
            goto do2;           // register operand, nothing more to compare in operand 1
        // NOTE(review): is32bitaddr() is not visible from here, and passing
        // I32 alone may not cover 64 bit mode - confirm before enabling
        if (is32bitaddr(I32,cs1.Iflags))
        {
            if (issib(rm) && cs1.Isib != cs2.Isib)
                goto nomatch;
            if (
                ((rm & 0xC0) == 0 && !((rm & 7) == 4 && (cs1.Isib & 7) == 5 || (rm & 7) == 5))
               )
                goto do2;       /* if no first operand  */
        }
        else
        {
            if (
                ((rm & 0xC0) == 0 && !((rm & 7) == 6))
               )
                goto do2;       /* if no first operand  */
        }
        if (cs1.IFL1 != cs2.IFL1)
            goto nomatch;
        if (flinsymtab[cs1.IFL1] && cs1.IEV1.Vsym != cs2.IEV1.Vsym)
            goto nomatch;
        if (cs1.IEV1.Voffset != cs2.IEV1.Voffset)
            goto nomatch;
    }

do2:
    if (!(ins & T))                     // if no second operand
        goto match;
    if (cs1.IFL2 != cs2.IFL2)
        goto nomatch;
    if (flinsymtab[cs1.IFL2] && cs1.IEV2.Vsym != cs2.IEV2.Vsym)
        goto nomatch;
    if (cs1.IEV2.Voffset != cs2.IEV2.Voffset)
        goto nomatch;

match:
    return 1;

nomatch:
    return 0;
}

}
6724 
/************************
 * Little buffer allocated on the stack to accumulate instruction bytes to
 * later be sent along to objmod
 *
 * Bytes are collected with gen()/genp() and handed to objmod.bytes() by
 * flush(). When `disasmBuf` is set, flushed bytes are also appended to it;
 * the write16/write32/write64 helpers append a value to `disasmBuf` only.
 */
private struct MiniCodeBuf
{
nothrow:
    uint index;                 // number of bytes currently held in `bytes`
    uint offset;                // offset in `seg` where the next flush writes
    int seg;                    // segment the bytes are written to
    Barray!ubyte* disasmBuf;    // if not null, receives a copy of the output
    ubyte[256] bytes; // = void;

    /******************************
     * Start an empty buffer positioned at the current Offset(seg).
     * Params:
     *      seg = segment to write to
     */
    @trusted
    this(int seg)
    {
        index = 0;
        this.offset = cast(uint)Offset(seg);
        this.seg = seg;
    }

    /******************************
     * Unconditionally write the accumulated bytes to objmod (and to
     * `disasmBuf` if set), advance `offset` and empty the buffer.
     */
    @trusted
    void flushx()
    {
        // Emit accumulated bytes to code segment
        debug assert(index < bytes.length);

        if (disasmBuf)                     // write to buffer for disassembly
        {
            foreach (c; bytes[0 .. index]) // not efficient, but for verbose output anyway
                disasmBuf.push(c);
        }

        offset += objmod.bytes(seg, offset, index, bytes.ptr);
        index = 0;
    }

    /// Append one byte; the caller must make sure there is room (see available())
    @trusted
    void gen(ubyte c) { bytes[index++] = c; }

    /// Append `n` bytes starting at `p`; the caller must make sure there is room
    @trusted
    void genp(uint n, void *p) { memcpy(&bytes[index], p, n); index += n; }

    /// Write out the buffer if it holds anything
    @trusted
    void flush() { if (index) flushx(); }

    /// Returns: offset in the segment of the next byte to be generated
    @trusted
    uint getOffset() { return offset + index; }

    /// Returns: number of bytes that can still be appended before a flush is needed
    @trusted
    uint available() { return cast(uint)bytes.length - index; }

    /******************************
     * write64/write32/write16 write `value` to `disasmBuf`
     * least significant byte first. They do nothing when `disasmBuf` is null.
     */
    @trusted
    void write64(ulong value)
    {
        if (disasmBuf)
        {
            // one byte per 8 bit step, covering all 64 bits
            disasmBuf.push(cast(ubyte)value);
            disasmBuf.push(cast(ubyte)(value >>  8));
            disasmBuf.push(cast(ubyte)(value >> 16));
            disasmBuf.push(cast(ubyte)(value >> 24));
            disasmBuf.push(cast(ubyte)(value >> 32));
            disasmBuf.push(cast(ubyte)(value >> 40));
            disasmBuf.push(cast(ubyte)(value >> 48));
            disasmBuf.push(cast(ubyte)(value >> 56));
        }
    }

    /// ditto
    pragma(inline, true)
    @trusted
    void write32(uint value)
    {
        if (disasmBuf)
        {
            disasmBuf.push(cast(ubyte)value);
            disasmBuf.push(cast(ubyte)(value >>  8));
            disasmBuf.push(cast(ubyte)(value >> 16));
            disasmBuf.push(cast(ubyte)(value >> 24));
        }
    }

    /// ditto
    pragma(inline, true)
    @trusted
    void write16(uint value)
    {
        if (disasmBuf)
        {
            disasmBuf.push(cast(ubyte)value);
            disasmBuf.push(cast(ubyte)(value >> 8));
        }
    }
}
6820 
/**************************
 * Convert instructions to object code and write them to objmod.
 *
 * Each real instruction is emitted in this order: prefix bytes
 * (FWAIT, segment override, address size, operand size), the opcode
 * (either a VEX prefix + opcode, or the legacy opcode bytes with an optional
 * REX byte), the modregrm/SIB bytes with their displacement, and finally the
 * second (immediate/address) operand.
 * Pseudo-ops (ESCAPE, plain NOP) emit nothing; inline ASM blobs are copied out
 * as is.
 *
 * Side effects visible in this function: Offset(seg) is advanced to the end of
 * the emitted code, and some instructions in the list are rewritten in place
 * (long conditional jumps, PUSH of a block address).
 *
 * In debug builds the number of bytes emitted for each instruction is checked
 * against calccodsize().
 * Params:
 *      seg = code segment to write to, code starts at Offset(seg)
 *      c = list of instructions to write
 *      disasmBuf = if not null, then also write object code here
 * Returns:
 *      offset of end of code emitted
 */

@trusted
uint codout(int seg, code *c, Barray!ubyte* disasmBuf)
{
    ubyte rm,mod;
    ubyte ins;
    code *cn;
    uint flags;
    Symbol *s;

    debug
    if (debugc) printf("codout(%p), Coffset = x%llx\n",c,cast(ulong)Offset(seg));

    // Set up the code buffer, starting at the current offset of the segment
    MiniCodeBuf ggen = void;
    ggen.index = 0;
    ggen.offset = cast(uint)Offset(seg);
    ggen.seg = seg;
    ggen.disasmBuf = disasmBuf;

    for (; c; c = code_next(c))
    {
        debug
        {
        if (debugc) { printf("off=%02x, sz=%d, ",cast(int)ggen.getOffset(),cast(int)calccodsize(c)); code_print(c); }
        // Remembered so the emitted size can be verified at the bottom of the loop
        uint startoffset = ggen.getOffset();
        }

        opcode_t op = c.Iop;
        // Encoding attributes assuming a one byte opcode; replaced further
        // down for 0F xx opcodes and for VEX encoded instructions
        ins = inssize[op & 0xFF];
        // Pseudo-ops are recognized by the low byte of the opcode. A `break`
        // out of this switch means "this is a real instruction, encode it".
        switch (op & 0xFF)
        {
            case ESCAPE:
                /* Check for SSE4 opcode v/pmaxuw xmm1,xmm2/m128 */
                if(op == 0x660F383E || c.Iflags & CFvex) break;

                switch (op & 0xFFFF00)
                {   case ESClinnum:
                        /* put out line number stuff    */
                        objmod.linnum(c.IEV1.Vsrcpos,seg,ggen.getOffset());
                        break;
                    case ESCadjesp:
                        //printf("adjust ESP %ld\n", cast(long)c.IEV1.Vint);
                        break;

                    default:
                        break;
                }

                // Escapes never produce object code
                debug
                assert(calccodsize(c) == 0);

                continue;

            case NOP:                   /* don't send them out          */
                // Only a plain NOP is dropped; a longer opcode that merely
                // ends in the NOP byte is a real instruction
                if (op != NOP)
                    break;
                debug
                assert(calccodsize(c) == 0);

                continue;

            case ASM:
                if (op != ASM)
                    break;
                // The bytes go straight to objmod, so anything still pending
                // in ggen has to be written out first
                ggen.flush();
                if (c.Iflags == CFaddrsize)    // kludge for DA inline asm
                {
                    do32bit(ggen, FLblockoff,c.IEV1,0,0);
                }
                else
                {
                    ggen.offset += objmod.bytes(seg,ggen.offset,cast(uint)c.IEV1.len,c.IEV1.bytes);
                }
                debug
                assert(calccodsize(c) == c.IEV1.len);

                continue;

            default:
                break;
        }
        flags = c.Iflags;

        // See if we need to flush (don't have room for largest code sequence)
        if (ggen.available() < (1+4+4+8+8))
            ggen.flush();

        // see if we need to put out prefix bytes
        if (flags & (CFwait | CFPREFIX | CFjmp16))
        {
            int override_;

            if (flags & CFwait)
                ggen.gen(0x9B);                      // FWAIT
                                                /* ? SEGES : SEGSS      */
            // At most one segment override prefix is emitted
            switch (flags & CFSEG)
            {   case CFes:      override_ = SEGES;       goto segover;
                case CFss:      override_ = SEGSS;       goto segover;
                case CFcs:      override_ = SEGCS;       goto segover;
                case CFds:      override_ = SEGDS;       goto segover;
                case CFfs:      override_ = SEGFS;       goto segover;
                case CFgs:      override_ = SEGGS;       goto segover;
                segover:        ggen.gen(cast(ubyte)override_);
                                break;

                default:        break;
            }

            if (flags & CFaddrsize)
                ggen.gen(0x67);                      // address size override

            // Do this last because of instructions like ADDPD
            if (flags & CFopsize)
                ggen.gen(0x66);                      /* operand size         */

            // A short conditional jump (0x70..0x7F) flagged CFjmp16 has to
            // reach further than a one byte displacement allows
            if ((op & ~0x0F) == 0x70 && flags & CFjmp16) /* long condit jmp */
            {
                if (!I16)
                {   // Put out 16 bit conditional jump
                    // i.e. rewrite 0x7x as the two byte opcode 0x0F 0x8x
                    c.Iop = op = 0x0F00 | (0x80 | (op & 0x0F));
                }
                else
                {
                    // Replace "Jcc target" with "J!cc over; JMP target":
                    // a new JMP instruction is linked in after this one and
                    // the inverted condition skips over it
                    cn = code_calloc();
                    /*cxcalloc++;*/
                    cn.next = code_next(c);
                    c.next= cn;          // link into code
                    cn.Iop = JMP;              // JMP block
                    cn.IFL2 = c.IFL2;
                    cn.IEV2.Vblock = c.IEV2.Vblock;
                    c.Iop = op ^= 1;           // toggle condition
                    c.IFL2 = FLconst;
                    c.IEV2.Vpointer = I16 ? 3 : 5; // skip over JMP block
                    c.Iflags &= ~CFjmp16;
                }
            }
        }

        if (flags & CFvex)
        {
            // VEX encoded: the VEX prefix takes the place of any REX byte and
            // of the leading opcode bytes, so go directly to the modregrm part
            if (flags & CFvex3)
            {
                // three byte form: 0xC4 followed by two VEX payload bytes
                ggen.gen(0xC4);
                ggen.gen(cast(ubyte)VEX3_B1(c.Ivex));
                ggen.gen(cast(ubyte)VEX3_B2(c.Ivex));
                ggen.gen(c.Ivex.op);
            }
            else
            {
                // two byte form: 0xC5 followed by one VEX payload byte
                ggen.gen(0xC5);
                ggen.gen(cast(ubyte)VEX2_B1(c.Ivex));
                ggen.gen(c.Ivex.op);
            }
            ins = vex_inssize(c);
            goto Lmodrm;
        }

        if (op > 0xFF)
        {
            // Multi-byte opcode: look up the attributes in the two byte opcode table
            if ((op & 0xFFFD00) == 0x0F3800)
                ins = inssize2[(op >> 8) & 0xFF];
            else if ((op & 0xFF00) == 0x0F00)
                ins = inssize2[op & 0xFF];

            // The opcode bytes are stored most significant byte first in op.
            // When the leading byte is one of F2/F3/66 it is emitted ahead of
            // the REX byte; any other leading byte goes after the REX byte.
            if (op & 0xFF_00_00_00)
            {
                // four byte opcode
                ubyte op1 = op >> 24;
                if (op1 == 0xF2 || op1 == 0xF3 || op1 == 0x66)
                {
                    ggen.gen(op1);
                    if (c.Irex)
                        ggen.gen(c.Irex | REX);
                }
                else
                {
                    if (c.Irex)
                        ggen.gen(c.Irex | REX);
                    ggen.gen(op1);
                }
                ggen.gen((op >> 16) & 0xFF);
                ggen.gen((op >> 8) & 0xFF);
                ggen.gen(op & 0xFF);
            }
            else if (op & 0xFF0000)
            {
                // three byte opcode
                ubyte op1 = cast(ubyte)(op >> 16);
                if (op1 == 0xF2 || op1 == 0xF3 || op1 == 0x66)
                {
                    ggen.gen(op1);
                    if (c.Irex)
                        ggen.gen(c.Irex | REX);
                }
                else
                {
                    if (c.Irex)
                        ggen.gen(c.Irex | REX);
                    ggen.gen(op1);
                }
                ggen.gen((op >> 8) & 0xFF);
                ggen.gen(op & 0xFF);
            }
            else
            {
                // two byte opcode
                if (c.Irex)
                    ggen.gen(c.Irex | REX);
                ggen.gen((op >> 8) & 0xFF);
                ggen.gen(op & 0xFF);
            }
        }
        else
        {
            // one byte opcode
            if (c.Irex)
                ggen.gen(c.Irex | REX);
            ggen.gen(cast(ubyte)op);
        }
  Lmodrm:
        if (ins & M)            /* if modregrm byte             */
        {
            rm = c.Irm;
            ggen.gen(rm);

            // Look for an address size override when working with the
            // MOD R/M and SIB bytes

            if (is32bitaddr( I32, flags))
            {
                if (issib(rm))
                    ggen.gen(c.Isib);
                // The mod field (top two bits) selects the displacement size
                switch (rm & 0xC0)
                {
                    case 0x40:
                        do8bit(ggen, cast(FL) c.IFL1,c.IEV1);     // 8 bit
                        break;

                    case 0:
                        // mod == 0 carries a displacement only for the
                        // base == 5 (SIB) or r/m == 5 encodings
                        if (!(issib(rm) && (c.Isib & 7) == 5 ||
                              (rm & 7) == 5))
                            break;
                        goto case 0x80;

                    case 0x80:
                    {
                        int cfflags = CFoff;
                        targ_size_t val = 0;
                        if (I64)
                        {
                            if ((rm & modregrm(3,0,7)) == modregrm(0,0,5))      // if disp32[RIP]
                            {
                                // val is the bias handed to do32bit() for the
                                // RIP relative fixup: -4 for the displacement
                                // itself, made more negative when an immediate
                                // operand follows the displacement
                                // (presumably by the size of that immediate)
                                cfflags |= CFpc32;
                                val = -4;
                                reg_t reg = rm & modregrm(0,7,0);
                                if (ins & T ||
                                    ((op == 0xF6 || op == 0xF7) && (reg == modregrm(0,0,0) || reg == modregrm(0,1,0))))
                                {   if (ins & E || op == 0xF6)
                                        val = -5;
                                    else if (c.Iflags & CFopsize)
                                        val = -6;
                                    else
                                        val = -8;
                                }

                                if (config.exe & (EX_OSX64 | EX_WIN64))
                                    /* Mach-O and Win64 fixups already take the 4 byte size
                                     * into account, so bias by 4
                                     */
                                    val += 4;
                            }
                        }
                        do32bit(ggen, cast(FL)c.IFL1,c.IEV1,cfflags,cast(int)val);
                        break;
                    }

                    default:
                        break;
                }
            }
            else
            {
                // 16 bit addressing: no SIB byte, displacements are 8 or 16 bits
                switch (rm & 0xC0)
                {   case 0x40:
                        do8bit(ggen, cast(FL) c.IFL1,c.IEV1);     // 8 bit
                        break;

                    case 0:
                        // mod == 0 carries a displacement only for r/m == 6
                        if ((rm & 7) != 6)
                            break;
                        goto case 0x80;

                    case 0x80:
                        do16bit(ggen, cast(FL)c.IFL1,c.IEV1,CFoff);
                        break;

                    default:
                        break;
                }
            }
        }
        else
        {
            // ENTER has no modregrm byte but still has a 16 bit first operand
            if (op == ENTER)
                do16bit(ggen, cast(FL)c.IFL1,c.IEV1,0);
        }
        // From here on only these flags are passed along to the operand emitters
        flags &= CFseg | CFoff | CFselfrel;
        if (ins & T)                    /* if second operand            */
        {
            if (ins & E)            /* if data-8                    */
                do8bit(ggen, cast(FL) c.IFL2,c.IEV2);
            else if (!I16)
            {
                // 32/64 bit code: operands default to 32 bits, CFopsize selects 16.
                // The do16/do32/do64/ptr1616/ptr1632 labels are shared with
                // the I16 switch further down.
                switch (op)
                {
                    case 0xC2:              /* RETN imm16           */
                    case 0xCA:              /* RETF imm16           */
                    do16:
                        do16bit(ggen, cast(FL)c.IFL2,c.IEV2,flags);
                        break;

                    case 0xA1:
                    case 0xA3:
                        // with a REX byte in 64 bit code the address field is 64 bits
                        if (I64 && c.Irex)
                        {
                    do64:
                            do64bit(ggen, cast(FL)c.IFL2,c.IEV2,flags);
                            break;
                        }
                        goto case 0xA0;

                    case 0xA0:              /* MOV AL,byte ptr []   */
                    case 0xA2:
                        if (c.Iflags & CFaddrsize && !I64)
                            goto do16;
                        else
                    do32:
                            do32bit(ggen, cast(FL)c.IFL2,c.IEV2,flags,0);
                        break;

                    case 0x9A:
                    case 0xEA:
                        if (c.Iflags & CFopsize)
                            goto ptr1616;
                        else
                            goto ptr1632;

                    case 0x68:              // PUSH immed32
                        // pushing a block means pushing its address, not a
                        // displacement to it; note this rewrites the instruction
                        if (c.IFL2 == FLblock)
                        {
                            c.IFL2 = FLblockoff;
                            goto do32;
                        }
                        else
                            goto case_default;

                    case CALL:              // CALL rel
                    case JMP:               // JMP  rel
                        flags |= CFselfrel;
                        goto case_default;

                    default:
                        if ((op|0xF) == 0x0F8F) // Jcc rel16 rel32
                            flags |= CFselfrel;
                        // opcodes 0xB8..0xBF with REX.W take a 64 bit immediate
                        if (I64 && (op & ~7) == 0xB8 && c.Irex & REX_W)
                            goto do64;
                    case_default:
                        if (c.Iflags & CFopsize)
                            goto do16;
                        else
                            goto do32;
                }
            }
            else
            {
                // 16 bit code: operands default to 16 bits, CFopsize/CFaddrsize select 32
                switch (op)
                {
                    case 0xC2:
                    case 0xCA:
                        goto do16;

                    case 0xA0:
                    case 0xA1:
                    case 0xA2:
                    case 0xA3:
                        if (c.Iflags & CFaddrsize)
                            goto do32;
                        else
                            goto do16;

                    case 0x9A:
                    case 0xEA:
                        if (c.Iflags & CFopsize)
                            goto ptr1632;
                        else
                            goto ptr1616;

                    // Operand of opcodes 0x9A/0xEA: both labels share this code.
                    // The reference is written directly through objmod, so
                    // the buffered bytes are flushed first.
                    ptr1616:
                    ptr1632:
                        //assert(c.IFL2 == FLfunc);
                        ggen.flush();
                        if (c.IFL2 == FLdatseg)
                        {
                            objmod.reftodatseg(seg,ggen.offset,c.IEV2.Vpointer,
                                    c.IEV2.Vseg,flags);
                            ggen.offset += 4;
                        }
                        else
                        {
                            s = c.IEV2.Vsym;
                            ggen.offset += objmod.reftoident(seg,ggen.offset,s,0,flags);
                        }
                        break;

                    case 0x68:              // PUSH immed16
                        if (c.IFL2 == FLblock)
                        {   c.IFL2 = FLblockoff;
                            goto do16;
                        }
                        else
                            goto case_default16;

                    case CALL:
                    case JMP:
                        flags |= CFselfrel;
                        goto default;

                    default:
                    case_default16:
                        if (c.Iflags & CFopsize)
                            goto do32;
                        else
                            goto do16;
                }
            }
        }
        // Opcodes 0xF6/0xF7 are handled here rather than through the T
        // attribute: an immediate is emitted only when the reg field of
        // modregrm is 0 (TEST)
        else if (op == 0xF6)            /* TEST mem8,immed8             */
        {
            if ((rm & (7<<3)) == 0)
                do8bit(ggen, cast(FL)c.IFL2,c.IEV2);
        }
        else if (op == 0xF7)
        {
            if ((rm & (7<<3)) == 0)     /* TEST mem16/32,immed16/32     */
            {
                // CFopsize flips the default operand size of the current mode
                if ((I32 || I64) ^ ((c.Iflags & CFopsize) != 0))
                    do32bit(ggen, cast(FL)c.IFL2,c.IEV2,flags,0);
                else
                    do16bit(ggen, cast(FL)c.IFL2,c.IEV2,flags);
            }
        }

        // Sanity check: what was emitted must match what calccodsize() predicted
        debug
        if (ggen.getOffset() - startoffset != calccodsize(c))
        {
            printf("actual: %d, calc: %d\n", cast(int)(ggen.getOffset() - startoffset), cast(int)calccodsize(c));
            code_print(c);
            assert(0);
        }
    }
    // Write out whatever is still buffered and record the new end of the segment
    ggen.flush();
    Offset(seg) = ggen.offset;
    //printf("-codout(), Coffset = x%x\n", Offset(seg));
    return cast(uint)ggen.offset;                      /* ending address               */
}
7291 
7292 
/**************************
 * Emit the 64 bit operand field of an instruction.
 *
 * Operands whose value is known here are written through pbuf.genp().
 * All others flush pbuf, mirror the value with pbuf.write64(), register
 * a reference with objmod, and then step pbuf.offset over the 8 byte field.
 * Only valid when generating 64 bit code.
 * Params:
 *      pbuf = code buffer the instruction is being assembled in
 *      fl = kind of operand stored in uev
 *      uev = the operand
 *      flags = CFxxxx flags, passed on to objmod when a reference is made
 */
@trusted
private void do64bit(ref MiniCodeBuf pbuf, FL fl, ref evc uev,int flags)
{
    assert(I64);

    targ_size_t field;          // value of the field, for the cases that need a copy of it

    switch (fl)
    {
        case FLconst:
            field = *cast(targ_size_t *) &uev;
            pbuf.genp(8,&field);
            return;

        case FLdatseg:
            pbuf.flush();
            pbuf.write64(uev.Vpointer);
            objmod.reftodatseg(pbuf.seg,pbuf.offset,uev.Vpointer,uev.Vseg,CFoffset64 | flags);
            break;

        case FLframehandler:
            // remember where the field is, emit 0 for now
            framehandleroffset = pbuf.getOffset();
            field = 0;
            pbuf.genp(8,&field);
            return;

        case FLswitch:
            pbuf.flush();
            field = uev.Vswitch.Btableoffset;
            pbuf.write64(field);
            if (config.flags & CFGromable)
                objmod.reftocodeseg(pbuf.seg,pbuf.offset,field);
            else
                objmod.reftodatseg(pbuf.seg,pbuf.offset,field,objmod.jmpTableSegment(funcsym_p),CFoff);
            break;

        case FLcsdata:
        case FLfardata:
            //symbol_print(uev.Vsym);
            // NOTE: In ELFOBJ all symbol refs have been tagged FLextern
            // strings and statics are treated like offsets from an
            // un-named external which is the start of .rodata or .data
        case FLextern:                      /* external data symbol         */
        case FLtlsdata:
        {
            pbuf.flush();
            Symbol* sym = uev.Vsym;
            pbuf.write64(uev.Voffset);
            objmod.reftoident(pbuf.seg,pbuf.offset,sym,uev.Voffset,CFoffset64 | flags);
            break;
        }

        case FLgotoff:
        {
            // only available for Posix targets other than OSX
            if (config.exe & (EX_OSX | EX_OSX64))
                assert(0);
            if (!(config.exe & EX_posix))
                assert(0);

            pbuf.flush();
            Symbol* sym = uev.Vsym;
            pbuf.write64(uev.Voffset);
            objmod.reftoident(pbuf.seg,pbuf.offset,sym,uev.Voffset,CFoffset64 | flags);
            break;
        }

        case FLgot:
        {
            if (config.exe & (EX_OSX | EX_OSX64))
            {
                // remember where the field is, emit 0 for now
                funcsym_p.Slocalgotoffset = pbuf.getOffset();
                field = 0;
                pbuf.genp(8,&field);
                return;
            }
            if (!(config.exe & EX_posix))
                assert(0);

            pbuf.flush();
            Symbol* sym = uev.Vsym;
            pbuf.write64(uev.Voffset);
            objmod.reftoident(pbuf.seg,pbuf.offset,sym,uev.Voffset,CFoffset64 | flags);
            break;
        }

        case FLfunc:                        /* function call                */
        {
            Symbol* func = uev.Vsym;
            assert(TARGET_SEGMENTED || !tyfarfunc(func.ty()));
            pbuf.flush();
            pbuf.write64(0);
            objmod.reftoident(pbuf.seg,pbuf.offset,func,0,CFoffset64 | flags);
            break;
        }

        case FLblock:                       /* displacement to another block */
            field = uev.Vblock.Boffset - pbuf.getOffset() - 4;
            //printf("FLblock: funcoffset = %x, pbuf.getOffset = %x, Boffset = %x, ad = %x\n", funcoffset, pbuf.getOffset(), uev.Vblock.Boffset, ad);
            pbuf.genp(8,&field);
            return;

        case FLblockoff:                    /* address of another block      */
            pbuf.flush();
            assert(uev.Vblock);
            //printf("FLblockoff: offset = %x, Boffset = %x, funcoffset = %x\n", pbuf.offset, uev.Vblock.Boffset, funcoffset);
            pbuf.write64(uev.Vblock.Boffset);
            objmod.reftocodeseg(pbuf.seg,pbuf.offset,uev.Vblock.Boffset);
            break;

        default:
            WRFL(fl);
            assert(0);
    }

    // Reached by every case that left the switch with `break`:
    // account for the 8 byte field
    pbuf.offset += 8;
}
7405 
7406 
/**************************
 * Emit the 32 bit operand field of an instruction.
 *
 * Operands whose value is known here are written through pbuf.genp().
 * All others flush pbuf, register a reference with objmod, mirror the value
 * with pbuf.write32(), and then step pbuf.offset over the 4 byte field.
 * Params:
 *      pbuf = code buffer the instruction is being assembled in
 *      fl = kind of operand stored in uev
 *      uev = the operand
 *      flags = CFxxxx flags, passed on to objmod when a reference is made
 *      val = bias added to the offset of symbol references
 */
@trusted
private void do32bit(ref MiniCodeBuf pbuf, FL fl, ref evc uev,int flags, int val)
{
    targ_size_t field;          // value of the field, for the cases that need a copy of it

    //printf("do32bit(flags = x%x)\n", flags);
    switch (fl)
    {
        case FLconst:
            assert(targ_size_t.sizeof == 4 || targ_size_t.sizeof == 8);
            field = * cast(targ_size_t *) &uev;
            pbuf.genp(4,&field);
            return;

        case FLdatseg:
            pbuf.flush();
            objmod.reftodatseg(pbuf.seg,pbuf.offset,uev.Vpointer,uev.Vseg,flags);
            pbuf.write32(cast(uint)uev.Vpointer);
            break;

        case FLframehandler:
            // remember where the field is, emit 0 for now
            framehandleroffset = pbuf.getOffset();
            field = 0;
            pbuf.genp(4,&field);
            return;

        case FLswitch:
            pbuf.flush();
            field = uev.Vswitch.Btableoffset;
            if (!(config.flags & CFGromable))
            {
                objmod.reftodatseg(pbuf.seg,pbuf.offset,field,objmod.jmpTableSegment(funcsym_p),CFoff);
            }
            else if (config.exe & (EX_OSX | EX_OSX64))
            {
                // These are magic values based on the exact code generated for the switch jump
                if (I64)
                    uev.Vswitch.Btablebase = pbuf.getOffset() + 4;
                else
                    uev.Vswitch.Btablebase = pbuf.getOffset() + 4 - 8;
                // the table is addressed relative to Btablebase, no fixup needed
                field -= uev.Vswitch.Btablebase;
                pbuf.genp(4,&field);
                return;
            }
            else if ((config.exe & EX_windos) && I64)
            {
                uev.Vswitch.Btablebase = pbuf.getOffset() + 4;
                field -= uev.Vswitch.Btablebase;
                pbuf.genp(4,&field);
                return;
            }
            else
            {
                objmod.reftocodeseg(pbuf.seg,pbuf.offset,field);
            }
            pbuf.write32(cast(uint)field);
            break;

        case FLcode:
            //assert(JMPJMPTABLE);            // the only use case
            pbuf.flush();
            field = *cast(targ_size_t *) &uev + pbuf.getOffset();
            objmod.reftocodeseg(pbuf.seg,pbuf.offset,field);
            pbuf.write32(cast(uint)field);
            break;

        case FLcsdata:
        case FLfardata:
            //symbol_print(uev.Vsym);

            // NOTE: In ELFOBJ all symbol refs have been tagged FLextern
            // strings and statics are treated like offsets from an
            // un-named external which is the start of .rodata or .data
        case FLextern:                      /* external data symbol         */
        case FLtlsdata:
        {
            pbuf.flush();
            Symbol* sym = uev.Vsym;
            if ((config.exe & EX_windos) && I64 && (flags & CFpc32))
            {
                /* This is for those funky fixups where the location to be fixed up
                 * is a 'val' amount back from the current RIP, biased by adding 4.
                 */
                assert(val >= -5 && val <= 0);
                flags |= (-val & 7) << 24;          // set CFREL value
                assert(CFREL == (7 << 24));
                objmod.reftoident(pbuf.seg,pbuf.offset,sym,uev.Voffset,flags);
                pbuf.write32(cast(uint)uev.Voffset);
            }
            else
            {
                objmod.reftoident(pbuf.seg,pbuf.offset,sym,uev.Voffset + val,flags);
                pbuf.write32(cast(uint)(uev.Voffset + val));
            }
            break;
        }

        case FLgotoff:
        {
            // only available for Posix targets other than OSX
            if (config.exe & (EX_OSX | EX_OSX64))
                assert(0);
            if (!(config.exe & EX_posix))
                assert(0);

            pbuf.flush();
            Symbol* sym = uev.Vsym;
            objmod.reftoident(pbuf.seg,pbuf.offset,sym,uev.Voffset + val,flags);
            pbuf.write32(cast(uint)(uev.Voffset + val));
            break;
        }

        case FLgot:
        {
            if (config.exe & (EX_OSX | EX_OSX64))
            {
                // remember where the field is, emit 0 for now
                funcsym_p.Slocalgotoffset = pbuf.getOffset();
                field = 0;
                pbuf.genp(4,&field);
                return;
            }
            if (!(config.exe & EX_posix))
                assert(0);

            pbuf.flush();
            Symbol* sym = uev.Vsym;
            objmod.reftoident(pbuf.seg,pbuf.offset,sym,uev.Voffset + val,flags);
            pbuf.write32(cast(uint)(uev.Voffset + val));
            break;
        }

        case FLfunc:                        /* function call                */
        {
            Symbol* func = uev.Vsym;
            if (tyfarfunc(func.ty()))
            {   /* Large code references are always absolute    */
                pbuf.flush();
                pbuf.offset += objmod.reftoident(pbuf.seg,pbuf.offset,func,0,flags) - 4;
                pbuf.write32(0);
            }
            else if (func.Sseg == pbuf.seg &&
                     (func.Sclass == SC.static_ || func.Sclass == SC.global) &&
                     func.Sxtrnnum == 0 && flags & CFselfrel)
            {   /* if we know it's relative address     */
                field = func.Soffset - pbuf.getOffset() - 4;
                pbuf.genp(4,&field);
                return;
            }
            else
            {
                assert(TARGET_SEGMENTED || !tyfarfunc(func.ty()));
                pbuf.flush();
                objmod.reftoident(pbuf.seg,pbuf.offset,func,val,flags);
                pbuf.write32(cast(uint)(val));
            }
            break;
        }

        case FLblock:                       /* displacement to another block */
            field = uev.Vblock.Boffset - pbuf.getOffset() - 4;
            //printf("FLblock: funcoffset = %x, pbuf.getOffset = %x, Boffset = %x, ad = %x\n", funcoffset, pbuf.getOffset(), uev.Vblock.Boffset, ad);
            pbuf.genp(4,&field);
            return;

        case FLblockoff:                    /* address of another block      */
            pbuf.flush();
            assert(uev.Vblock);
            //printf("FLblockoff: offset = %x, Boffset = %x, funcoffset = %x\n", pbuf.offset, uev.Vblock.Boffset, funcoffset);
            objmod.reftocodeseg(pbuf.seg,pbuf.offset,uev.Vblock.Boffset);
            pbuf.write32(cast(uint)(uev.Vblock.Boffset));
            break;

        default:
            WRFL(fl);
            assert(0);
    }

    // Reached by every case that left the switch with `break`:
    // account for the 4 byte field
    pbuf.offset += 4;
}
7585 
7586 
/**************************
 * Emit the 16 bit operand field of an instruction.
 *
 * Operands whose value is known here are written through pbuf.genp().
 * All others flush pbuf, register a reference with objmod, mirror the value
 * with pbuf.write16(), and then step pbuf.offset over the 2 byte field.
 * Params:
 *      pbuf = code buffer the instruction is being assembled in
 *      fl = kind of operand stored in uev
 *      uev = the operand
 *      flags = CFxxxx flags, passed on to objmod when a reference is made
 */
@trusted
private void do16bit(ref MiniCodeBuf pbuf, FL fl, ref evc uev,int flags)
{
    targ_size_t field;          // value of the field, for the cases that need a copy of it

    switch (fl)
    {
        case FLconst:
            pbuf.genp(2,cast(char *) &uev);
            return;

        case FLdatseg:
            pbuf.flush();
            objmod.reftodatseg(pbuf.seg,pbuf.offset,uev.Vpointer,uev.Vseg,flags);
            pbuf.write16(cast(uint)uev.Vpointer);
            break;

        case FLswitch:
            pbuf.flush();
            field = uev.Vswitch.Btableoffset;
            if (config.flags & CFGromable)
                objmod.reftocodeseg(pbuf.seg,pbuf.offset,field);
            else
                objmod.reftodatseg(pbuf.seg,pbuf.offset,field,objmod.jmpTableSegment(funcsym_p),CFoff);
            pbuf.write16(cast(uint)field);
            break;

        case FLcsdata:
        case FLfardata:
        case FLextern:                      /* external data symbol         */
        case FLtlsdata:
        {
            //assert(SIXTEENBIT || TARGET_SEGMENTED);
            pbuf.flush();
            Symbol* sym = uev.Vsym;
            objmod.reftoident(pbuf.seg,pbuf.offset,sym,uev.Voffset,flags);
            pbuf.write16(cast(uint)uev.Voffset);
            break;
        }

        case FLfunc:                        /* function call                */
        {
            //assert(SIXTEENBIT || TARGET_SEGMENTED);
            Symbol* func = uev.Vsym;
            if (tyfarfunc(func.ty()))
            {   /* Large code references are always absolute    */
                pbuf.flush();
                pbuf.offset += objmod.reftoident(pbuf.seg,pbuf.offset,func,0,flags) - 2;
            }
            else if (func.Sseg == pbuf.seg &&
                     (func.Sclass == SC.static_ || func.Sclass == SC.global) &&
                     func.Sxtrnnum == 0 && flags & CFselfrel)
            {   /* if we know it's relative address     */
                field = func.Soffset - pbuf.getOffset() - 2;
                pbuf.genp(2,&field);                // displacement
                return;
            }
            else
            {
                pbuf.flush();
                objmod.reftoident(pbuf.seg,pbuf.offset,func,0,flags);
            }
            // both reference paths mirror a 0 for the field
            pbuf.write16(0);
            break;
        }

        case FLblock:                       /* displacement to another block */
            field = uev.Vblock.Boffset - pbuf.getOffset() - 2;
            debug
            {
                // the displacement has to fit in 16 bits
                targ_ptrdiff_t delta = uev.Vblock.Boffset - pbuf.getOffset() - 2;
                assert(cast(short)delta == delta);
            }
            pbuf.genp(2,&field);                    // displacement
            return;

        case FLblockoff:                    /* address of another block      */
            pbuf.flush();
            objmod.reftocodeseg(pbuf.seg,pbuf.offset,uev.Vblock.Boffset);
            pbuf.write16(cast(uint)uev.Vblock.Boffset);
            break;

        default:
            WRFL(fl);
            assert(0);
    }

    // Reached by every case that left the switch with `break`:
    // account for the 2 byte field
    pbuf.offset += 2;
}
7673 
7674 
/***************************
 * Write one operand byte into the code buffer.
 *
 * For FLconst the byte is uev.Vuns truncated to 8 bits. For FLblock the byte
 * is the displacement `Boffset - pbuf.getOffset() - 1` to the target block;
 * if that does not fit in a signed byte a diagnostic is printed and
 * err_exit() is called. Any other fl value is an internal error.
 * Params:
 *      pbuf = buffer the byte is appended to
 *      fl   = kind of operand held in uev
 *      uev  = operand value
 */
@trusted
private void do8bit(ref MiniCodeBuf pbuf, FL fl, ref evc uev)
{
    ubyte value;

    if (fl == FLconst)
    {
        value = cast(ubyte)uev.Vuns;
    }
    else if (fl == FLblock)
    {
        const targ_ptrdiff_t disp = uev.Vblock.Boffset - pbuf.getOffset() - 1;
        const bool fitsInByte = cast(byte)disp == disp;
        if (!fitsInByte)
        {
            // Prefix the message with the source position when one is recorded
            if (uev.Vblock.Bsrcpos.Slinnum)
                printf("%s(%d): ", uev.Vblock.Bsrcpos.Sfilename, uev.Vblock.Bsrcpos.Slinnum);
            printf("block displacement of %lld exceeds the maximum offset of -128 to 127.\n", cast(long)disp);
            err_exit();
        }
        value = cast(ubyte)disp;
        debug assert(uev.Vblock.Boffset > pbuf.getOffset() || value != 0x7F);
    }
    else
    {
        debug printf("fl = %d\n",fl);
        assert(0);
    }

    pbuf.gen(value);
}
7705 
7706 
/***************************
 * Debug helper: print every instruction in a code list.
 *
 * Walks the list with code_next() and hands each element to code_print().
 * Params:
 *      c = first instruction of the list, may be null
 */

void WRcodlst(code *c)
{
    while (c)
    {
        code_print(c);
        c = code_next(c);
    }
}
7716 
/***************************
 * Debug helper: print a single code instruction to stdout.
 *
 * Output is one line containing the address of the instruction and of its
 * successor, any VEX prefix bytes, the REX bits (either c.Irex or, for VEX
 * instructions, the equivalent bits decoded from c.Ivex), the opcode, the
 * flags and - when the instruction size table entry has the M / T bits set -
 * the mod/rm + sib bytes with operand 1, and operand 2.
 * Params:
 *      c = instruction to print; null prints "code 0"
 */
@trusted
extern (C) void code_print(scope code* c)
{
    ubyte ins;
    ubyte rexb;

    if (c == null)
    {
        printf("code 0\n");
        return;
    }

    // Fetch the instruction size/attribute bits for this opcode
    const op = c.Iop;
    if (c.Iflags & CFvex && c.Ivex.pfx == 0xC4)
        ins = vex_inssize(c);
    else if ((c.Iop & 0xFFFD00) == 0x0F3800)
        ins = inssize2[(op >> 8) & 0xFF];
    else if ((c.Iop & 0xFF00) == 0x0F00)
        ins = inssize2[op & 0xFF];
    else
        ins = inssize[op & 0xFF];

    printf("code %p: nxt=%p ",c,code_next(c));

    if (c.Iflags & CFvex)
    {
        if (c.Iflags & CFvex3)
        {
            printf("vex=0xC4");
            printf(" 0x%02X", VEX3_B1(c.Ivex));
            printf(" 0x%02X", VEX3_B2(c.Ivex));
            // Build the REX-style bits from the VEX fields; r, x and b are
            // tested negated here, w is not
            rexb =
                ( c.Ivex.w ? REX_W : 0) |
                (!c.Ivex.r ? REX_R : 0) |
                (!c.Ivex.x ? REX_X : 0) |
                (!c.Ivex.b ? REX_B : 0);
        }
        else
        {
            printf("vex=0xC5");
            printf(" 0x%02X", VEX2_B1(c.Ivex));
            rexb = !c.Ivex.r ? REX_R : 0;
        }
        printf(" ");
    }
    else
        rexb = c.Irex;

    if (rexb)
    {
        // Print the same value the W/R/X/B letters are decoded from, so the
        // hex and the letters always agree (for VEX instructions rexb is
        // derived from Ivex and need not equal c.Irex)
        printf("rex=0x%02X ", rexb);
        if (rexb & REX_W)
            printf("W");
        if (rexb & REX_R)
            printf("R");
        if (rexb & REX_X)
            printf("X");
        if (rexb & REX_B)
            printf("B");
        printf(" ");
    }
    printf("op=0x%02X",op);

    if ((op & 0xFF) == ESCAPE)
    {
        if ((op & 0xFF00) == ESClinnum)
        {
            // Line number escapes carry nothing else worth printing
            printf(" linnum = %d\n",c.IEV1.Vsrcpos.Slinnum);
            return;
        }
        printf(" ESCAPE %d",c.Iop >> 8);
    }
    if (c.Iflags)
        printf(" flg=%x",c.Iflags);
    if (ins & M)
    {
        // mod/rm byte, split into its 2/3/3 bit fields
        uint rm = c.Irm;
        printf(" rm=0x%02X=%d,%d,%d",rm,(rm>>6)&3,(rm>>3)&7,rm&7);
        if (!I16 && issib(rm))
        {
            ubyte sib = c.Isib;
            printf(" sib=%02x=%d,%d,%d",sib,(sib>>6)&3,(sib>>3)&7,sib&7);
        }
        // Operand 1 is only printed for these mod/rm forms
        if ((rm & 0xC7) == BPRM || (rm & 0xC0) == 0x80 || (rm & 0xC0) == 0x40)
        {
            switch (c.IFL1)
            {
                case FLconst:
                case FLoffset:
                    printf(" int = %4d",c.IEV1.Vuns);
                    break;

                case FLblock:
                    printf(" block = %p",c.IEV1.Vblock);
                    break;

                case FLswitch:
                case FLblockoff:
                case FLlocalsize:
                case FLframehandler:
                case 0:
                    break;

                case FLdatseg:
                    printf(" FLdatseg %d.%llx",c.IEV1.Vseg,cast(ulong)c.IEV1.Vpointer);
                    break;

                case FLauto:
                case FLfast:
                case FLreg:
                case FLdata:
                case FLudata:
                case FLpara:
                case FLbprel:
                case FLtlsdata:
                case FLextern:
                    printf(" ");
                    WRFL(c.IFL1);
                    printf(" sym='%s'",c.IEV1.Vsym.Sident.ptr);
                    if (c.IEV1.Voffset)
                        printf(".%d", cast(int)c.IEV1.Voffset);
                    break;

                default:
                    // Separate the FL name from the preceding field, as the
                    // symbol cases above do
                    printf(" ");
                    WRFL(c.IFL1);
                    break;
            }
        }
    }
    if (ins & T)
    {
        // The FL kind of operand 2 is always printed first; the cases below
        // only add the operand's value
        printf(" ");
        WRFL(c.IFL2);
        switch (c.IFL2)
        {
            case FLconst:
                printf(" int = %4d",c.IEV2.Vuns);
                break;

            case FLblock:
                printf(" block = %p",c.IEV2.Vblock);
                break;

            case FLswitch:
            case FLblockoff:
            case 0:
            case FLlocalsize:
            case FLframehandler:
                break;

            case FLdatseg:
                printf(" %d.%llx",c.IEV2.Vseg,cast(ulong)c.IEV2.Vpointer);
                break;

            case FLauto:
            case FLfast:
            case FLreg:
            case FLpara:
            case FLbprel:
            case FLfunc:
            case FLdata:
            case FLudata:
            case FLtlsdata:
                printf(" sym='%s'",c.IEV2.Vsym.Sident.ptr);
                break;

            case FLcode:
                printf(" code = %p",c.IEV2.Vcode);
                break;

            default:
                // FL kind was already printed before the switch; nothing
                // more is known about the operand
                break;
        }
    }
    printf("\n");
}
7894 
/**************************************
 * Pretty-print a CF mask.
 *
 * Each flag from the table below that is set in `cf` is printed by name,
 * with '|' between entries. Bits that remain set after all named flags have
 * been consumed are printed as one hexadecimal value, so the output never
 * ends in a dangling '|'. The line is terminated with a newline.
 * Params:
 *      cf = CF mask
 */
@trusted
extern (C) void CF_print(uint cf)
{
    // Print `name` if `mask` is set in cf, then clear it from cf so that the
    // separator is only emitted while something is still left to print
    void print(uint mask, const(char)* name)
    {
        if (cf & mask)
        {
            printf("%s", name);
            cf &= ~mask;
            if (cf)
                printf("|");
        }
    }

    print(CFindirect, "CFindirect");
    print(CFswitch, "CFswitch");
    print(CFjmp5, "CFjmp5");
    print(CFvex3, "CFvex3");
    print(CFvex, "CFvex");
    print(CFpc32, "CFpc32");
    print(CFoffset64, "CFoffset64");
    print(CFclassinit, "CFclassinit");
    print(CFvolatile, "CFvolatile");
    print(CFtarg2, "CFtarg2");
    print(CFunambig, "CFunambig");
    print(CFselfrel, "CFselfrel");
    print(CFwait, "CFwait");
    print(CFfs, "CFfs");
    print(CFcs, "CFcs");
    print(CFds, "CFds");
    print(CFss, "CFss");
    print(CFes, "CFes");
    print(CFaddrsize, "CFaddrsize");
    print(CFopsize, "CFopsize");
    print(CFpsw, "CFpsw");
    print(CFoff, "CFoff");
    print(CFseg, "CFseg");
    print(CFtarg, "CFtarg");
    print(CFjmp16, "CFjmp16");

    // Whatever is left has no name in the table above
    if (cf)
        printf("0x%x", cf);
    printf("\n");
}