/**
 * Instruction scheduler
 *
 * Compiler implementation of the
 * $(LINK2 https://www.dlang.org, D programming language).
 *
 * Copyright:   Copyright (C) 1995-1998 by Symantec
 *              Copyright (C) 2000-2023 by The D Language Foundation, All Rights Reserved
 * Authors:     $(LINK2 https://www.digitalmars.com, Walter Bright)
 * License:     $(LINK2 https://www.boost.org/LICENSE_1_0.txt, Boost License 1.0)
 * Source:      $(LINK2 https://github.com/dlang/dmd/blob/master/src/dmd/backend/cgsched.d, backend/cgsched.d)
 */

module dmd.backend.cgsched;

import core.stdc.stdio;
import core.stdc.stdlib;
import core.stdc.string;

import dmd.backend.cc;
import dmd.backend.cdef;
import dmd.backend.cgen : gen1, gen2;
import dmd.backend.code;
import dmd.backend.code_x86;
import dmd.backend.dlist;
import dmd.backend.global;
import dmd.backend.mem;
import dmd.backend.ty;
import dmd.backend.barray;


nothrow:
@safe:

// is32bitaddr works correctly only when x is 0 or 1.  This is
// true today for the current definition of I32, but if the definition
// of I32 changes, this function will need to change as well.
//
// Note: even for linux targets, CFaddrsize can be set by the inline
// assembler.
/**
 * Decide whether an instruction uses 32 bit addressing.
 * Params:
 *      x = default addressing mode, true for 32 bit
 *          (NOTE(review): presumably I32 at the call sites - confirm)
 *      Iflags = instruction flags; CFaddrsize flips the default
 * Returns:
 *      true for 64 bit targets, otherwise x toggled by CFaddrsize
 */
private bool is32bitaddr(bool x, uint Iflags)
{
    if (I64)
        return true;

    const bool toggled = (Iflags & CFaddrsize) != 0;
    return x != toggled;
}

// If we use Pentium Pro scheduler
@trusted
private bool PRO()
{
    return config.target_cpu >= TARGET_PentiumPro;
}

/// Kind of floating point instruction, stored in Cinfo.fp_op
private enum FP : ubyte
{
    none = 0,       /// not one of the kinds below
    fstp = 1,       /// FSTP mem
    fld  = 2,       /// FLD mem
    fop  = 3,       /// Fop ST0,mem or Fop ST0
}

/// Bit flags stored in Cinfo.flags
private enum CIFL : ubyte
{
    arraybounds = 1,     /// instruction is a jmp to array bounds
    ea          = 2,     /// instruction has a memory-referencing
                         /// modregrm EA byte
    nostage     = 4,     /// instruction must not be staged
    push        = 8,     /// a push that may be swapped around
}

/// Everything the scheduler gathers about one instruction
struct Cinfo
{
    code *c;            // the instruction itself
    ubyte pair;         // pairing class (NP, UV, ...)
    ubyte sz;           // size of the operand
    ubyte isz;          // size of the instruction

    // Floating point scheduling
    ubyte fxch_pre;
    ubyte fxch_post;
    FP fp_op;           /// one of FP.xxxx

    ubyte flags;        /// set of CIFL.xxx bits

    uint r;             // mask of what is read
    uint w;             // mask of what is written
    uint a;             // registers taking part in the addressing mode
    ubyte reg;          // reg field of the modregrm byte
    ubyte uops;         // micro-op count for the Pentium Pro
    uint sibmodrm;      // (sib << 8) + mod__rm byte
    uint spadjust;      // when !=0, how much ESP moves when this
                        // instruction executes
    int fpuadjust;      // when !=0, how much the FPU stack moves when
                        // this instruction executes

    /// Dump the contents to stdout, for debugging
    @trusted
    nothrow void print()
    {
        Cinfo *ci = &this;

        if (ci == null)
        {
            printf("Cinfo 0\n");
            return;
        }

        printf("Cinfo %p: c %p, pair %x, sz %d, isz %d, flags - ",
               ci, c, pair, sz, isz);

        // One name per known flag, then a catch-all for stray bits
        const known = CIFL.arraybounds | CIFL.nostage | CIFL.push | CIFL.ea;
        if (flags & CIFL.arraybounds)
            printf("arraybounds,");
        if (flags & CIFL.ea)
            printf("ea,");
        if (flags & CIFL.nostage)
            printf("nostage,");
        if (flags & CIFL.push)
            printf("push,");
        if (flags & ~known)
            printf("bad flag,");

        printf("\n\tr %x w %x a %x reg %x uops %x sibmodrm %x spadjust %d\n",
               cast(int)r, cast(int)w, cast(int)a, reg, uops, sibmodrm, cast(int)spadjust);

        if (fp_op != FP.none)
        {
            // indexed by fp_op - 1, so the order follows enum FP
            __gshared const(char*)[3] fpops = ["fstp","fld","fop"];

            printf("\tfp_op %s, fxch_pre %x, fxch_post %x\n",
                   fpops[fp_op-1], fxch_pre, fxch_post);
        }
    }

}


/*****************************************
 * Do Pentium optimizations.
 * Params:
 *      pc = address of the head of the code list, updated in place
 *      scratch = scratch registers we can use
 */

@trusted
private void cgsched_pentium(code **pc,regm_t scratch)
{
    //printf("scratch = x%02x\n",scratch);
    if (config.target_scheduler < TARGET_80486)
        return;

    if (!I64)
        *pc = peephole(*pc,0);

    if (!I32)                   // forget about 16 bit code
        return;

    // simpleops() is applied for the original Pentium and Pentium MMX only
    const bool plainPentium = config.target_cpu == TARGET_Pentium ||
                              config.target_cpu == TARGET_PentiumMMX;
    if (plainPentium)
        *pc = simpleops(*pc,scratch);
    *pc = schedule(*pc,0);
}

/************************************
 * Entry point: schedule the code of one basic block.
 * Does nothing unless optimizing for speed on a Pentium or later target,
 * and never touches inline assembler blocks.
 * Params:
 *      b = block whose Bcode is rescheduled in place
 */
@trusted
public void cgsched_block(block* b)
{
    const bool wanted = (config.flags4 & CFG4speed) &&
                        config.target_cpu >= TARGET_Pentium &&
                        b.BC != BCasm;
    if (!wanted)
        return;

    // Scratch registers are whatever the block has no claim on
    regm_t scratch = allregs;
    scratch &= ~(b.Bregcon.used | b.Bregcon.params | mfuncreg |
                 b.Bregcon.immed.mval | b.Bregcon.cse.mval);

    cgsched_pentium(&b.Bcode,scratch);
    //printf("after schedule:\n"); WRcodlst(b.Bcode);
}

/// Pentium pairing classes, as stored in pentcycl[] and Cinfo.pair
enum
{
    NP    = 0,       /// not pairable
    PU    = 1,       /// pairable in U only, never executed in V
    PV    = 2,       /// pairable in V only
    UV    = (PU|PV), /// pairable in both U and V
    PE    = 4,       /// register contention exception
    PF    = 8,       /// flags contention exception
    FX    = 0x10,    /// pairable with FXCH instruction
}

/// Default pairing class for each one byte opcode; pair_class() starts
/// from this and refines it using the modregrm byte.
// NOTE(review): in the D8 row DA and DB are FX while DC is NP, although
// pair_class() separately sets FX for some DB forms - confirm intended.
extern (D) private immutable ubyte[256] pentcycl =
[
        UV,UV,UV,UV,    UV,UV,NP,NP,    // 0
        UV,UV,UV,UV,    UV,UV,NP,NP,    // 8
        PU,PU,PU,PU,    PU,PU,NP,NP,    // 10
        PU,PU,PU,PU,    PU,PU,NP,NP,    // 18
        UV,UV,UV,UV,    UV,UV,NP,NP,    // 20
        UV,UV,UV,UV,    UV,UV,NP,NP,    // 28
        UV,UV,UV,UV,    UV,UV,NP,NP,    // 30
        UV,UV,UV,UV,    UV,UV,NP,NP,    // 38

        UV,UV,UV,UV,    UV,UV,UV,UV,    // 40
        UV,UV,UV,UV,    UV,UV,UV,UV,    // 48
        PE|UV,PE|UV,PE|UV,PE|UV,        PE|UV,PE|UV,PE|UV,PE|UV, // 50  PUSH reg
        PE|UV,PE|UV,PE|UV,PE|UV,        PE|UV,PE|UV,PE|UV,PE|UV, // 58  POP reg
        NP,NP,NP,NP,    NP,NP,NP,NP,    // 60
        PE|UV,NP,PE|UV,NP,      NP,NP,NP,NP,    // 68
        PV|PF,PV|PF,PV|PF,PV|PF,        PV|PF,PV|PF,PV|PF,PV|PF,        // 70   Jcc rel8
        PV|PF,PV|PF,PV|PF,PV|PF,        PV|PF,PV|PF,PV|PF,PV|PF,        // 78   Jcc rel8

        NP,NP,NP,NP,    NP,NP,NP,NP,    // 80
        UV,UV,UV,UV,    NP,UV,NP,NP,    // 88
        NP,NP,NP,NP,    NP,NP,NP,NP,    // 90
        NP,NP,NP,NP,    NP,NP,NP,NP,    // 98
        UV,UV,UV,UV,    NP,NP,NP,NP,    // A0
        UV,UV,NP,NP,    NP,NP,NP,NP,    // A8
        UV,UV,UV,UV,    UV,UV,UV,UV,    // B0
        UV,UV,UV,UV,    UV,UV,UV,UV,    // B8

        NP,NP,NP,NP,    NP,NP,NP,NP,    // C0
        NP,NP,NP,NP,    NP,NP,NP,NP,    // C8
        PU,PU,NP,NP,    NP,NP,NP,NP,    // D0
        FX,NP,FX,FX,    NP,NP,FX,NP,    // D8 all floating point
        NP,NP,NP,NP,    NP,NP,NP,NP,    // E0
        PE|PV,PV,NP,PV, NP,NP,NP,NP,    // E8
        NP,NP,NP,NP,    NP,NP,NP,NP,    // F0
        NP,NP,NP,NP,    NP,NP,NP,NP,    // F8
];

/********************************************
 * For each opcode, determine read [0] and written [1] masks.
*/

/// Flag bits used in the read/write mask tables below, where they are
/// OR'd together with the mXX register masks.
enum
{
    EA      = 0x100000,     /// effective address operand
                            /// (NOTE(review): presumably the r/m operand of modregrm - confirm)
    R       = 0x200000,     /// register (reg of modregrm field)
    N       = 0x400000,     /// other things modified, not swappable
    B       = 0x800000,     /// it's a byte operation
    C       = 0x1000000,    /// floating point flags
    mMEM    = 0x2000000,    /// memory
    S       = 0x4000000,    /// floating point stack
    F       = 0x8000000,    /// flags
}

/**
 * Read and write masks for each one byte opcode:
 *      oprw[op][0] = what the instruction reads
 *      oprw[op][1] = what the instruction writes
 * getinfo() indexes this with the low byte of the opcode, then replaces the
 * masks for the group opcodes 0x80, 0x81, 0x83, 0xF6, 0xF7 and 0xFF (from
 * grprw) and for floating point opcodes with Irm < 0xC0 (from grpf1).
 *
 * The arithmetic rows 00..38 each list six forms in the same order:
 * EA8,r8 / EA,r / r8,EA8 / r,EA / AL,imm8 / EAX,imm.
 */
extern (D) private immutable uint[2][256] oprw =
[
      // 00
      [ EA|R|B, F|EA|B ],      // ADD
      [ EA|R,   F|EA   ],
      [ EA|R|B, F|R|B  ],
      [ EA|R,   F|R    ],
      [ mAX,    F|mAX  ],
      [ mAX,    F|mAX  ],
      [ N,      N      ],      // PUSH ES
      [ N,      N      ],      // POP  ES

      // 08
      [ EA|R|B, F|EA|B ],      // OR
      [ EA|R,   F|EA   ],
      [ EA|R|B, F|R|B  ],
      [ EA|R,   F|R    ],
      [ mAX,    F|mAX  ],
      [ mAX,    F|mAX  ],
      [ N,      N      ],      // PUSH CS
      [ N,      N      ],      // 2 byte escape

      // 10
      [ F|EA|R|B,F|EA|B ],     // ADC (reads the flags as well)
      [ F|EA|R, F|EA   ],
      [ F|EA|R|B,F|R|B ],
      [ F|EA|R, F|R    ],
      [ F|mAX,  F|mAX  ],
      [ F|mAX,  F|mAX  ],
      [ N,      N      ],      // PUSH SS
      [ N,      N      ],      // POP  SS

      // 18
      [ F|EA|R|B,F|EA|B ],     // SBB (reads the flags as well)
      [ F|EA|R, F|EA   ],
      [ F|EA|R|B,F|R|B ],
      [ F|EA|R, F|R    ],
      [ F|mAX,  F|mAX  ],
      [ F|mAX,  F|mAX  ],
      [ N,      N      ],      // PUSH DS
      [ N,      N      ],      // POP  DS

      // 20
      [ EA|R|B, F|EA|B ],      // AND
      [ EA|R,   F|EA   ],
      [ EA|R|B, F|R|B  ],
      [ EA|R,   F|R    ],
      [ mAX,    F|mAX  ],
      [ mAX,    F|mAX  ],
      [ N,      N      ],      // SEG ES
      [ F|mAX,  F|mAX  ],      // DAA

      // 28
      [ EA|R|B, F|EA|B ],      // SUB
      [ EA|R,   F|EA   ],
      [ EA|R|B, F|R|B  ],
      [ EA|R,   F|R    ],
      [ mAX,    F|mAX  ],
      [ mAX,    F|mAX  ],
      [ N,      N      ],      // SEG CS
      [ F|mAX,  F|mAX  ],      // DAS

      // 30
      [ EA|R|B, F|EA|B ],      // XOR
      [ EA|R,   F|EA   ],
      [ EA|R|B, F|R|B  ],
      [ EA|R,   F|R    ],
      [ mAX,    F|mAX  ],
      [ mAX,    F|mAX  ],
      [ N,      N      ],      // SEG SS
      [ F|mAX,  F|mAX  ],      // AAA

      // 38
      [ EA|R|B, F ],           // CMP (writes only the flags)
      [ EA|R,   F ],
      [ EA|R|B, F ],
      [ EA|R,   F ],
      [ mAX,    F ],           // CMP AL,imm8
      [ mAX,    F ],           // CMP EAX,imm16/32
      [ N,      N ],           // SEG DS
      [ N,      N ],           // AAS

      // 40
      [ mAX,    F|mAX ],       // INC EAX
      [ mCX,    F|mCX ],
      [ mDX,    F|mDX ],
      [ mBX,    F|mBX ],
      [ mSP,    F|mSP ],
      [ mBP,    F|mBP ],
      [ mSI,    F|mSI ],
      [ mDI,    F|mDI ],

      // 48
      [ mAX,    F|mAX ],       // DEC EAX
      [ mCX,    F|mCX ],
      [ mDX,    F|mDX ],
      [ mBX,    F|mBX ],
      [ mSP,    F|mSP ],
      [ mBP,    F|mBP ],
      [ mSI,    F|mSI ],
      [ mDI,    F|mDI ],

      // 50
      [ mAX|mSP,        mSP|mMEM ],            // PUSH EAX
      [ mCX|mSP,        mSP|mMEM ],
      [ mDX|mSP,        mSP|mMEM ],
      [ mBX|mSP,        mSP|mMEM ],
      [ mSP|mSP,        mSP|mMEM ],
      [ mBP|mSP,        mSP|mMEM ],
      [ mSI|mSP,        mSP|mMEM ],
      [ mDI|mSP,        mSP|mMEM ],

      // 58
      [ mSP|mMEM,       mAX|mSP ],             // POP EAX
      [ mSP|mMEM,       mCX|mSP ],
      [ mSP|mMEM,       mDX|mSP ],
      [ mSP|mMEM,       mBX|mSP ],
      [ mSP|mMEM,       mSP|mSP ],
      [ mSP|mMEM,       mBP|mSP ],
      [ mSP|mMEM,       mSI|mSP ],
      [ mSP|mMEM,       mDI|mSP ],

      // 60
      [ N,      N ],           // PUSHA
      [ N,      N ],           // POPA
      [ N,      N ],           // BOUND Gv,Ma
      [ N,      N ],           // ARPL  Ew,Rw
      [ N,      N ],           // SEG FS
      [ N,      N ],           // SEG GS
      [ N,      N ],           // operand size prefix
      [ N,      N ],           // address size prefix

      // 68
      [ mSP,    mSP|mMEM ],    // PUSH immed16/32
      [ EA,     F|R      ],    // IMUL Gv,Ev,lv
      [ mSP,    mSP|mMEM ],    // PUSH immed8
      [ EA,     F|R      ],    // IMUL Gv,Ev,lb
      [ N,      N        ],    // INSB Yb,DX
      [ N,      N        ],    // INSW/D Yv,DX
      [ N,      N        ],    // OUTSB DX,Xb
      [ N,      N        ],    // OUTSW/D DX,Xv

      // 70 (conditional jumps: read the flags, never swappable)
      [ F|N,    N ],
      [ F|N,    N ],
      [ F|N,    N ],
      [ F|N,    N ],
      [ F|N,    N ],
      [ F|N,    N ],
      [ F|N,    N ],
      [ F|N,    N ],

      // 78 (conditional jumps, continued)
      [ F|N,    N ],
      [ F|N,    N ],
      [ F|N,    N ],
      [ F|N,    N ],
      [ F|N,    N ],
      [ F|N,    N ],
      [ F|N,    N ],
      [ F|N,    N ],

      // 80 (for 80, 81 and 83 getinfo() takes the masks from grprw[0])
      [ N,      N ],
      [ N,      N ],
      [ N,      N ],
      [ N,      N ],
      [ EA|R,   F ],           // TEST EA,r8
      [ EA|R,   F ],           // TEST EA,r16/32
      [ EA|R,   EA|R ],        // XCHG EA,r8
      [ EA|R,   EA|R ],        // XCHG EA,r16/32

      // 88
      [ R|B,    EA|B ],        // MOV EA8,r8
      [ R,      EA ],          // MOV EA,r16/32
      [ EA|B,   R|B ],         // MOV r8,EA8
      [ EA,     R ],           // MOV r16/32,EA
      [ N,      N ],           // MOV EA,segreg
      [ EA,     R ],           // LEA r16/32,EA
      [ N,      N ],           // MOV segreg,EA
      [ mSP|mMEM, EA|mSP ],    // POP mem16/32

      // 90 (91..97 exchange EAX with another register)
      [ 0,              0       ],             // NOP
      [ mAX|mCX,        mAX|mCX ],
      [ mAX|mDX,        mAX|mDX ],
      [ mAX|mBX,        mAX|mBX ],
      [ mAX|mSP,        mAX|mSP ],
      [ mAX|mBP,        mAX|mBP ],
      [ mAX|mSI,        mAX|mSI ],
      [ mAX|mDI,        mAX|mDI ],

      // 98
      [ mAX,            mAX      ],            // CBW
      [ mAX,            mDX      ],            // CWD
      [ N,              N|F      ],            // CALL far ptr
      [ N,              N        ],            // WAIT
      [ F|mSP,          mSP|mMEM ],            // PUSHF
      [ mSP|mMEM,       F|mSP    ],            // POPF
      [ mAX,            F        ],            // SAHF
      [ F,              mAX      ],            // LAHF

      // A0 (for A0..A3 getinfo() fakes an EA, see there)
      [ mMEM,           mAX  ],                // MOV AL,moffs8
      [ mMEM,           mAX  ],                // MOV EAX,moffs32
      [ mAX,            mMEM ],                // MOV moffs8,AL
      [ mAX,            mMEM ],                // MOV moffs32,EAX
      [ N,              N    ],                // MOVSB
      [ N,              N    ],                // MOVSW/D
      [ N,              N    ],                // CMPSB
      [ N,              N    ],                // CMPSW/D

      // A8
      [ mAX,    F ],                           // TEST AL,imm8
      [ mAX,    F ],                           // TEST AX,imm16
      [ N,      N ],                           // STOSB
      [ N,      N ],                           // STOSW/D
      [ N,      N ],                           // LODSB
      [ N,      N ],                           // LODSW/D
      [ N,      N ],                           // SCASB
      [ N,      N ],                           // SCASW/D

      // B0 (the last four entries reuse mAX..mBX, one per register)
      [ 0,      mAX ],                         // MOV AL,imm8
      [ 0,      mCX ],
      [ 0,      mDX ],
      [ 0,      mBX ],
      [ 0,      mAX ],
      [ 0,      mCX ],
      [ 0,      mDX ],
      [ 0,      mBX ],

      // B8
      [ 0,      mAX ],                         // MOV AX,imm16
      [ 0,      mCX ],
      [ 0,      mDX ],
      [ 0,      mBX ],
      [ 0,      mSP ],
      [ 0,      mBP ],
      [ 0,      mSI ],
      [ 0,      mDI ],

      // C0
      [ EA,     F|EA ],        // Shift Eb,Ib
      [ EA,     F|EA ],
      [ N,      N    ],
      [ N,      N    ],
      [ N,      N    ],
      [ N,      N    ],
      [ 0,      EA|B ],        // MOV EA8,imm8
      [ 0,      EA   ],        // MOV EA,imm16

      // C8
      [ N,      N ],           // ENTER
      [ N,      N ],           // LEAVE
      [ N,      N ],           // RETF lw
      [ N,      N ],           // RETF
      [ N,      N ],           // INT 3
      [ N,      N ],           // INT lb
      [ N,      N ],           // INTO
      [ N,      N ],           // IRET

      // D0
      [ EA,             F|EA  ],               // Shift EA,1
      [ EA,             F|EA  ],
      [ EA|mCX,         F|EA  ],               // Shift EA,CL
      [ EA|mCX,         F|EA  ],
      [ mAX,            F|mAX ],               // AAM
      [ mAX,            F|mAX ],               // AAD
      [ N,              N     ],               // reserved
      [ mAX|mBX|mMEM,   mAX   ],               // XLAT

      // D8 (floating point; for Irm < 0xC0 getinfo() uses grpf1 instead)
      [ N,      N ],
      [ N,      N ],
      [ N,      N ],
      [ N,      N ],
      [ N,      N ],
      [ N,      N ],
      [ N,      N ],
      [ N,      N ],

      // E0
      [ F|mCX|N,mCX|N ],       // LOOPNE jb
      [ F|mCX|N,mCX|N ],       // LOOPE  jb
      [ mCX|N,  mCX|N ],       // LOOP   jb
      [ mCX|N,  N     ],       // JCXZ   jb
      [ N,      N     ],       // IN AL,lb
      [ N,      N     ],       // IN EAX,lb
      [ N,      N     ],       // OUT lb,AL
      [ N,      N     ],       // OUT lb,EAX

      // E8
      [ N,      N|F   ],       // CALL jv
      [ N,      N     ],       // JMP Jv
      [ N,      N     ],       // JMP Ab
      [ N,      N     ],       // JMP jb
      [ N|mDX,  N|mAX ],       // IN AL,DX
      [ N|mDX,  N|mAX ],       // IN AX,DX
      [ N|mAX|mDX,N   ],       // OUT DX,AL
      [ N|mAX|mDX,N   ],       // OUT DX,AX

      // F0 (for F6 and F7 getinfo() takes the masks from grprw[3] and grprw[1])
      [ N,      N ],           // LOCK
      [ N,      N ],           // reserved
      [ N,      N ],           // REPNE
      [ N,      N ],           // REP,REPE
      [ N,      N ],           // HLT
      [ F,      F ],           // CMC
      [ N,      N ],
      [ N,      N ],

      // F8 (for FF getinfo() takes the masks from grprw[2])
      [ 0,      F    ],        // CLC
      [ 0,      F    ],        // STC
      [ N,      N    ],        // CLI
      [ N,      N    ],        // STI
      [ N,      N    ],        // CLD
      [ N,      N    ],        // STD
      [ EA,     F|EA ],        // INC/DEC
      [ N,      N    ],
];

/****************************************
 * Same thing, but for groups.
*      grprw[group][reg field of modregrm][0] = read
 *                                        [1] = write
 * getinfo() uses group 0 for Grp 1 (opcodes 0x80, 0x81, 0x83), group 1 for
 * Grp 3 (0xF7), group 2 for Grp 5 (0xFF) and group 3 for the byte version
 * of Grp 3 (0xF6).  Only these four of the eight declared groups are
 * given initializers.
 */

extern (D) private immutable uint[2][8][8] grprw =
[
    [
        // Grp 1
      [ EA,           F|EA           ],           // ADD
      [ EA,           F|EA           ],           // OR
      [ F|EA,         F|EA           ],           // ADC
      [ F|EA,         F|EA           ],           // SBB
      [ EA,           F|EA           ],           // AND
      [ EA,           F|EA           ],           // SUB
      [ EA,           F|EA           ],           // XOR
      [ EA,           F              ],           // CMP
    ],
    [
        // Grp 3
      [ EA,           F              ],           // TEST EA,imm
      [ N,            N              ],           // reserved
      [ EA,           EA             ],           // NOT
      [ EA,           F|EA           ],           // NEG
      [ mAX|EA,       F|mAX|mDX      ],           // MUL
      [ mAX|EA,       F|mAX|mDX      ],           // IMUL
      [ mAX|mDX|EA,   F|mAX|mDX      ],           // DIV

        // Could generate an exception we want to catch
        //mAX|mDX|EA|N,   F|mAX|mDX|N,            // IDIV

      [ mAX|mDX|EA,   F|mAX|mDX      ],           // IDIV
    ],
    [
        // Grp 5
      [ EA,           F|EA           ],           // INC Ev
      [ EA,           F|EA           ],           // DEC Ev
      [ N|EA,         N              ],           // CALL Ev
      [ N|EA,         N              ],           // CALL eP
      [ N|EA,         N              ],           // JMP Ev
      [ N|EA,         N              ],           // JMP Ep
      [ mSP|EA,       mSP|mMEM       ],           // PUSH Ev
      [ N,            N              ],           // reserved
    ],
    [
        // Grp 3, byte version
      [ EA|B,         F              ],           // TEST EA,imm
      [ N,            N              ],           // reserved
      [ EA|B,         EA|B           ],           // NOT
      [ EA|B,         F|EA|B         ],           // NEG
      [ mAX|EA,       F|mAX          ],           // MUL
      [ mAX|EA,       F|mAX          ],           // IMUL
      [ mAX|EA,       F|mAX          ],           // DIV

        // Could generate an exception we want to catch
        //mAX|EA|N,       F|mAX|N,                // IDIV

      [ mAX|EA,       F|mAX          ],           // IDIV
    ]
];

/********************************************
 * For floating point opcodes 0xD8..0xDF, with Irm < 0xC0.
*      [][][0] = read
 *          [1] = write
 * The first index is the opcode minus 0xD8, the second is the reg field
 * of the modregrm byte (this is how getinfo() indexes it).
 */

extern (D) private immutable uint[2][8][8] grpf1 =
[
    [
        // 0xD8
      [ EA|S,   S|C    ],      // FADD  float
      [ EA|S,   S|C    ],      // FMUL  float
      [ EA|S,   C      ],      // FCOM  float
      [ EA|S,   S|C    ],      // FCOMP float
      [ EA|S,   S|C    ],      // FSUB  float
      [ EA|S,   S|C    ],      // FSUBR float
      [ EA|S,   S|C    ],      // FDIV  float
      [ EA|S,   S|C    ],      // FDIVR float
    ],
    [
        // 0xD9
      [ EA,     S|C    ],      // FLD   float
      [ N,      N      ],      //
      [ S,      EA|C   ],      // FST   float
      [ S,      EA|S|C ],      // FSTP  float
      [ N,      N      ],      // FLDENV
      [ N,      N      ],      // FLDCW
      [ N,      N      ],      // FSTENV
      [ N,      N      ],      // FSTCW
    ],
    [
        // 0xDA
      [ EA|S,   S|C    ],      // FIADD  long
      [ EA|S,   S|C    ],      // FIMUL  long
      [ EA|S,   C      ],      // FICOM  long
      [ EA|S,   S|C    ],      // FICOMP long
      [ EA|S,   S|C    ],      // FISUB  long
      [ EA|S,   S|C    ],      // FISUBR long
      [ EA|S,   S|C    ],      // FIDIV  long
      [ EA|S,   S|C    ],      // FIDIVR long
    ],
    [
        // 0xDB
      [ EA,     S|C    ],      // FILD   long
      [ S,      EA|S|C ],      // FISTTP int
      [ S,      EA|C   ],      // FIST   long
      [ S,      EA|S|C ],      // FISTP  long
      [ N,      N      ],      //
      [ EA,     S|C    ],      // FLD    real80
      [ N,      N      ],      //
      [ S,      EA|S|C ],      // FSTP   real80
    ],
    [
        // 0xDC
      [ EA|S,   S|C    ],      // FADD  double
      [ EA|S,   S|C    ],      // FMUL  double
      [ EA|S,   C      ],      // FCOM  double
      [ EA|S,   S|C    ],      // FCOMP double
      [ EA|S,   S|C    ],      // FSUB  double
      [ EA|S,   S|C    ],      // FSUBR double
      [ EA|S,   S|C    ],      // FDIV  double
      [ EA|S,   S|C    ],      // FDIVR double
    ],
    [
        // 0xDD
      [ EA,     S|C    ],      // FLD    double
      [ S,      EA|S|C ],      // FISTTP long
      [ S,      EA|C   ],      // FST    double
      [ S,      EA|S|C ],      // FSTP   double
      [ N,      N      ],      // FRSTOR
      [ N,      N      ],      //
      [ N,      N      ],      // FSAVE
      [ C,      EA     ],      // FSTSW
    ],
    [
        // 0xDE
      [ EA|S,   S|C    ],      // FIADD  short
      [ EA|S,   S|C    ],      // FIMUL  short
      [ EA|S,   C      ],      // FICOM  short
      [ EA|S,   S|C    ],      // FICOMP short
      [ EA|S,   S|C    ],      // FISUB  short
      [ EA|S,   S|C    ],      // FISUBR short
      [ EA|S,   S|C    ],      // FIDIV  short
      [ EA|S,   S|C    ],      // FIDIVR short
    ],
    [
        // 0xDF
      [ EA,     S|C    ],      // FILD   short
      [ S,      EA|S|C ],      // FISTTP short
      [ S,      EA|C   ],      // FIST   short
      [ S,      EA|S|C ],      // FISTP  short
      [ EA,     S|C    ],      // FBLD   packed BCD
      [ EA,     S|C    ],      // FILD   long long
      [ S,      EA|S|C ],      // FBSTP  packed BCD
      [ S,      EA|S|C ],      // FISTP  long long
    ]
];


/********************************************
 * Micro-ops for floating point opcodes 0xD8..0xDF, with Irm < 0xC0.
 * Indexed by [opcode - 0xD8][reg field of modregrm].  A 0 entry has no
 * count of its own; uops() reports such instructions as 5.
 */

extern (D) private immutable ubyte[8][8] uopsgrpf1 =
[
    [
        // 0xD8
        2,              // FADD  float
        2,              // FMUL  float
        2,              // FCOM  float
        2,              // FCOMP float
        2,              // FSUB  float
        2,              // FSUBR float
        2,              // FDIV  float
        2,              // FDIVR float
    ],
    [
        // 0xD9
        1,              // FLD   float
        0,              //
        2,              // FST   float
        2,              // FSTP  float
        5,              // FLDENV
        3,              // FLDCW
        5,              // FSTENV
        5,              // FSTCW
    ],
    [
        // 0xDA
        5,              // FIADD  long
        5,              // FIMUL  long
        5,              // FICOM  long
        5,              // FICOMP long
        5,              // FISUB  long
        5,              // FISUBR long
        5,              // FIDIV  long
        5,              // FIDIVR long
    ],
    [
        // 0xDB
        4,              // FILD   long
        0,              //
        4,              // FIST   long
        4,              // FISTP  long
        0,              //
        4,              // FLD    real80
        0,              //
        5,              // FSTP   real80
    ],
    [
        // 0xDC
        2,              // FADD  double
        2,              // FMUL  double
        2,              // FCOM  double
        2,              // FCOMP double
        2,              // FSUB  double
        2,              // FSUBR double
        2,              // FDIV  double
        2,              // FDIVR double
    ],
    [
        // 0xDD
        1,              // FLD    double
        0,              //
        2,              // FST    double
        2,              // FSTP   double
        5,              // FRSTOR
        0,              //
        5,              // FSAVE
        5,              // FSTSW
    ],
    [
        // 0xDE
        5,              // FIADD  short
        5,              // FIMUL  short
        5,              // FICOM  short
        5,              // FICOMP short
        5,              // FISUB  short
        5,              // FISUBR short
        5,              // FIDIV  short
        5,              // FIDIVR short
    ],
    [
        // 0xDF
        4,              // FILD   short
        0,              //
        4,              // FIST   short
        4,              // FISTP  short
        5,              // FBLD   packed BCD
        4,              // FILD   long long
        5,              // FBSTP  packed BCD
        4,              // FISTP  long long
    ]
];

/**************************************************
 * Determine number of micro-ops for Pentium Pro and Pentium II processors.
 *      0 means special case,
 *      5 means 'complex'
 * Indexed by the low opcode byte; two byte (0F xx) opcodes share entry 0x0F.
 * A 0 entry tells uops() to work the count out from the modregrm byte.
 *
 * Row 68 is PUSH imm32, IMUL r,rm,imm32, PUSH imm8, IMUL r,rm,imm8: both
 * pushes take 3 and both multiplies are special cases handled by uops().
 * (The row used to read 3,3,0,0, which left IMUL r,rm,imm32 at a fixed 3
 * and PUSH imm8 as an unhandled special case.)
 */

extern (D) private immutable ubyte[256] insuops =
[       0,0,0,0, 1,1,4,5,               /* 00 */
        0,0,0,0, 1,1,4,0,               /* 08 */
        0,0,0,0, 2,2,4,5,               /* 10 */
        0,0,0,0, 2,2,4,5,               /* 18 */
        0,0,0,0, 1,1,0,1,               /* 20 */
        0,0,0,0, 1,1,0,1,               /* 28 */
        0,0,0,0, 1,1,0,1,               /* 30 */
        0,0,0,0, 1,1,0,1,               /* 38 */
        1,1,1,1, 1,1,1,1,               /* 40 */
        1,1,1,1, 1,1,1,1,               /* 48 */
        3,3,3,3, 3,3,3,3,               /* 50 */
        2,2,2,2, 3,2,2,2,               /* 58 */
        5,5,5,5, 0,0,0,0,               /* 60 */
        3,0,3,0, 5,5,5,5,               /* 68 */
        1,1,1,1, 1,1,1,1,               /* 70 */
        1,1,1,1, 1,1,1,1,               /* 78 */
        0,0,0,0, 0,0,0,0,               /* 80 */
        0,0,0,0, 0,1,4,0,               /* 88 */
        1,3,3,3, 3,3,3,3,               /* 90 */
        1,1,5,0, 5,5,1,1,               /* 98 */
        1,1,2,2, 5,5,5,5,               /* A0 */
        1,1,3,3, 2,2,3,3,               /* A8 */
        1,1,1,1, 1,1,1,1,               /* B0 */
        1,1,1,1, 1,1,1,1,               /* B8 */
        0,0,5,4, 0,0,0,0,               /* C0 */
        5,3,5,5, 5,3,5,5,               /* C8 */
        0,0,0,0, 4,3,0,2,               /* D0 */
        0,0,0,0, 0,0,0,0,               /* D8 */
        4,4,4,2, 5,5,5,5,               /* E0 */
        4,1,5,1, 5,5,5,5,               /* E8 */
        0,0,5,5, 5,1,0,0,               /* F0 */
        1,1,5,5, 4,4,0,0,               /* F8 */
];

/// Starting micro-op count for floating point opcodes 0xD8..0xDF with
/// Irm >= 0xC0, indexed by opcode - 0xD8; uops() adjusts some 0xD9 and
/// 0xDE forms afterwards.
extern (D) private immutable ubyte[8] uopsx = [ 1,1,2,5,1,1,1,5 ];

/************************************************
 * Determine number of micro-ops for Pentium Pro and Pentium II processors.
 *      5 means 'complex'.
*      Opcodes whose insuops[] entry is 0 are resolved from the modregrm
 *      byte; whatever is still unresolved after that is reported as 5.
 *      Doesn't currently handle:
 *              MMX
 *              0F opcodes other than Jcc, SETcc, MOVZX, MOVSX and IMUL
 *              prefix bytes
 * Params:
 *      c = instruction to examine
 * Returns:
 *      number of micro-ops, with 5 standing for 'complex'
 */

private int uops(code *c)
{
    // Two byte (0F xx) opcodes all share the 0x0F slot of insuops[]
    const bool twobyte = (c.Iop & 0xFF00) == 0x0F00;
    const int op = twobyte ? 0x0F : (c.Iop & 0xFF);

    int n = insuops[op];
    if (n)
        return n;               // the table had a definite answer

    // Special case: the count depends on the modregrm byte
    const ubyte irm = c.Irm;
    const ubyte mod = (irm >> 6) & 3;
    const ubyte reg = (irm >> 3) & 7;
    const bool isreg = (mod == 3);      // EA is a register rather than memory

    switch (op)
    {
        case 0x10, 0x11:        // ADC rm,r
        case 0x18, 0x19:        // SBB rm,r
            n = isreg ? 2 : 4;
            break;

        case 0x12, 0x13:        // ADC r,rm
        case 0x1A, 0x1B:        // SBB r,rm
            n = isreg ? 2 : 3;
            break;

        case 0x00, 0x01:        // ADD rm,r
        case 0x08, 0x09:        // OR  rm,r
        case 0x20, 0x21:        // AND rm,r
        case 0x28, 0x29:        // SUB rm,r
        case 0x30, 0x31:        // XOR rm,r
            n = isreg ? 1 : 4;
            break;

        case 0x02, 0x03:        // ADD r,rm
        case 0x0A, 0x0B:        // OR  r,rm
        case 0x22, 0x23:        // AND r,rm
        case 0x2A, 0x2B:        // SUB r,rm
        case 0x32, 0x33:        // XOR r,rm
        case 0x38, 0x39:        // CMP rm,r
        case 0x3A, 0x3B:        // CMP r,rm
        case 0x69:              // IMUL rm,r,imm
        case 0x6B:              // IMUL rm,r,imm8
        case 0x84, 0x85:        // TEST rm,r
        case 0x88, 0x89:        // MOV rm,r
            n = isreg ? 1 : 2;
            break;

        case 0x80: .. case 0x83:
            if (reg == 2 || reg == 3)           // ADC/SBB rm,imm
                n = isreg ? 2 : 4;
            else if (reg == 7)                  // CMP rm,imm
                n = isreg ? 1 : 2;
            else
                n = isreg ? 1 : 4;
            break;

        case 0x86, 0x87:        // XCHG rm,r
            n = isreg ? 3 : 5;
            break;

        case 0x8A, 0x8B:        // MOV r,rm
            n = 1;
            break;

        case 0x8C:              // MOV Sreg,rm
            n = isreg ? 1 : 3;
            break;

        case 0x8F:
            if (reg == 0)       // POP m
                n = 5;
            break;

        case 0xC6, 0xC7:
            if (reg == 0)       // MOV rm,imm
                n = isreg ? 1 : 2;
            break;

        case 0xD0, 0xD1:
            if (reg == 2 || reg == 3)           // RCL/RCR rm,1
                n = isreg ? 2 : 4;
            else
                n = isreg ? 1 : 4;
            break;

        case 0xC0, 0xC1:        // RCL/RCR rm,imm8
        case 0xD2, 0xD3:
            if (reg == 2 || reg == 3)           // RCL/RCR rm,CL
                n = 5;
            else
                n = isreg ? 1 : 4;
            break;

        case 0xD8: .. case 0xDF:
            // Floating point opcodes
            if (irm < 0xC0)
            {
                n = uopsgrpf1[op - 0xD8][reg];
                break;
            }
            n = uopsx[op - 0xD8];
            if (op == 0xD9)
            {
                switch (irm)
                {
                    case 0xE0:                  // FCHS
                        n = 3;
                        break;

                    case 0xE8: .. case 0xED:
                        n = 2;
                        break;

                    case 0xF0: .. case 0xF5:
                    case 0xF8, 0xF9:
                    case 0xFB: .. case 0xFF:
                        n = 5;
                        break;

                    default:
                        break;
                }
            }
            else if (op == 0xDE && irm == 0xD9) // FCOMPP
                n = 2;
            break;

        case 0xF6:
            switch (reg)
            {
                case 6, 7:                      // DIV AL,rm8
                    n = isreg ? 3 : 4;
                    break;
                case 0, 4, 5:                   // MUL/IMUL/TEST rm8
                    n = isreg ? 1 : 2;
                    break;
                case 2, 3:                      // NOT/NEG rm
                    n = isreg ? 1 : 4;
                    break;
                default:
                    break;
            }
            break;

        case 0xF7:
            switch (reg)
            {
                case 6, 7:                      // DIV EAX,rm
                    n = 4;
                    break;
                case 4, 5:                      // MUL/IMUL rm
                    n = isreg ? 3 : 4;
                    break;
                case 2, 3:                      // NOT/NEG rm
                    n = isreg ? 1 : 4;
                    break;
                default:
                    break;
            }
            break;

        case 0xFF:
            switch (reg)
            {
                case 2, 3:                      // CALL rm, CALL m,rm
                case 5:                         // JMP seg:offset
                    n = 5;
                    break;
                case 4:
                    n = isreg ? 1 : 2;
                    break;
                case 0, 1:                      // INC/DEC rm
                    n = isreg ? 1 : 4;
                    break;
                case 6:                         // PUSH rm
                    n = isreg ? 3 : 4;
                    break;
                default:
                    break;
            }
            break;

        case 0x0F:
        {
            const int op2 = c.Iop & 0xFF;       // second opcode byte
            if ((op2 & 0xF0) == 0x80)           // Jcc
                n = 1;
            else if ((op2 & 0xF0) == 0x90)      // SETcc
                n = isreg ? 1 : 3;
            else if (op2 == 0xB6 || op2 == 0xB7 ||      // MOVZX
                     op2 == 0xBE || op2 == 0xBF)        // MOVSX
                n = 1;
            else if (op2 == 0xAF)               // IMUL r,m
                n = isreg ? 1 : 2;
            break;
        }

        default:
            break;
    }

    return n ? n : 5;           // copout for now
}

/******************************************
 * Determine pairing classification.
 * Starts from the pentcycl[] entry for the opcode and refines it using the
 * modregrm byte.  Floating point opcodes are only classified as far as
 * pairing with FXCH (FX) goes.
 * Params:
 *      c = instruction to classify
 * Returns:
 *      NP,UV,PU,PV optionally OR'd with PE or PF, or FX
 */

private int pair_class(code *c)
{
    // Of course, with Intel this is *never* simple, and Intel's
    // documentation is vague about the specifics.

    ubyte op = c.Iop & 0xFF;
    if ((c.Iop & 0xFF00) == 0x0F00)
        op = 0x0F;
    int cls = pentcycl[op];

    uint addr32 = I32;
    if (c.Iflags & CFaddrsize)
        addr32 ^= 1;

    const ubyte irm = c.Irm;
    const ubyte mod = (irm >> 6) & 3;
    const ubyte reg = (irm >> 3) & 7;
    const ubyte rm  = irm & 7;

    // If the EA contains a displacement then the instruction
    // can't execute in V, or pair in U
    const bool hasdisp =
        mod == 1 || mod == 2 ||
        (mod == 0 && (addr32 ? (rm == 5 || (rm == 4 && (c.Isib & 7) == 5))
                             : rm == 6));

    switch (op)
    {
        case 0x0F:                              // 2 byte opcode
            if ((c.Iop & 0xF0) == 0x80)         // if Jcc
                cls = PV | PF;
            break;

        case 0x80, 0x81, 0x83:
            // ADC/SBB EA,immed pair in U only; the rest of the group
            // (AND/OR/XOR/ADD/SUB/CMP EA,immed) in either pipe
            cls = (reg == 2 || reg == 3) ? PU : UV;
            if (hasdisp)
                cls = NP;
            break;

        case 0x84, 0x85:                        // TEST EA,reg
            if (mod == 3)                       // TEST reg,reg
                cls = UV;
            break;

        case 0xC0, 0xC1:
            if (reg >= 4)
                cls = PU;
            break;

        case 0xC6, 0xC7:
            if (reg == 0)                       // MOV EA,immed
                cls = hasdisp ? NP : UV;
            break;

        case 0xD9:
            if (irm < 0xC0)
            {
                if (reg == 0)
                    cls = FX;
            }
            else if (irm < 0xC8)
                cls = FX;
            else if (irm < 0xD0)
                cls = PV;
            else if (irm == 0xE0 || irm == 0xE1 || irm == 0xE4)
                cls = FX;
            break;

        case 0xDB, 0xDF:
            if (irm < 0xC0 && (reg == 0 || reg == 5))
                cls = FX;
            break;

        case 0xDD:
            if (irm < 0xC0)
            {
                if (reg == 0)
                    cls = FX;
            }
            else if (irm >= 0xE0 && irm < 0xF0)
                cls = FX;
            break;

        case 0xFE:
            if (reg == 0 || reg == 1)           // INC/DEC EA
                cls = UV;
            break;

        case 0xFF:
            if (reg == 0 || reg == 1)           // INC/DEC EA
                cls = UV;
            else if (reg == 2 || reg == 4)      // CALL/JMP near ptr EA
                cls = PE|PV;
            else if (reg == 6 && mod == 3)      // PUSH reg
                cls = PE | UV;
            break;

        default:
            break;
    }

    if ((c.Iflags & CFPREFIX) && cls == UV)     // if prefix byte
        cls = PU;
    return cls;
}

/******************************************
 * For an instruction, determine what is read
 * and what is written, and what is used for addressing.
 * Determine operand size if EA (larger is ok).
 */

@trusted
private void getinfo(out Cinfo ci,code *c)
{
    if (!c)
        return;
    ci.c = c;

    if (PRO)
    {
        ci.uops = cast(ubyte)uops(c);
        ci.isz = cast(ubyte)calccodsize(c);
    }
    else
        ci.pair = cast(ubyte)pair_class(c);

    ubyte op;
    ubyte op2;
    ubyte irm,mod,reg,rm;
    uint a32;
    int pc;
    uint r,w;
    int sz = I32 ?
4 : 2; 1296 1297 ci.r = 0; 1298 ci.w = 0; 1299 ci.a = 0; 1300 op = c.Iop & 0xFF; 1301 if ((c.Iop & 0xFF00) == 0x0F00) 1302 op = 0x0F; 1303 //printf("\tgetinfo %x, op %x \n",c,op); 1304 pc = pentcycl[op]; 1305 a32 = I32; 1306 if (c.Iflags & CFaddrsize) 1307 a32 ^= 1; 1308 if (c.Iflags & CFopsize) 1309 sz ^= 2 | 4; 1310 irm = c.Irm; 1311 mod = (irm >> 6) & 3; 1312 reg = (irm >> 3) & 7; 1313 rm = irm & 7; 1314 1315 r = oprw[op][0]; 1316 w = oprw[op][1]; 1317 1318 switch (op) 1319 { 1320 case 0x50: 1321 case 0x51: 1322 case 0x52: 1323 case 0x53: 1324 case 0x55: 1325 case 0x56: 1326 case 0x57: // PUSH reg 1327 ci.flags |= CIFL.push; 1328 goto Lpush; 1329 1330 case 0x54: // PUSH ESP 1331 case 0x6A: // PUSH imm8 1332 case 0x68: // PUSH imm 1333 case 0x0E: 1334 case 0x16: 1335 case 0x1E: 1336 case 0x06: 1337 case 0x9C: 1338 Lpush: 1339 ci.spadjust = -sz; 1340 ci.a |= mSP; 1341 break; 1342 1343 case 0x58: 1344 case 0x59: 1345 case 0x5A: 1346 case 0x5B: 1347 case 0x5C: 1348 case 0x5D: 1349 case 0x5E: 1350 case 0x5F: // POP reg 1351 case 0x1F: 1352 case 0x07: 1353 case 0x17: 1354 case 0x9D: // POPF 1355 Lpop: 1356 ci.spadjust = sz; 1357 ci.a |= mSP; 1358 break; 1359 1360 case 0x80: 1361 if (reg == 7) // CMP 1362 c.Iflags |= CFpsw; 1363 r = B | grprw[0][reg][0]; // Grp 1 (byte) 1364 w = B | grprw[0][reg][1]; 1365 break; 1366 1367 case 0x81: 1368 case 0x83: 1369 if (reg == 7) // CMP 1370 c.Iflags |= CFpsw; 1371 else if (irm == modregrm(3,0,SP)) // ADD ESP,imm 1372 { 1373 assert(c.IFL2 == FLconst); 1374 ci.spadjust = (op == 0x81) ? c.IEV2.Vint : cast(byte)c.IEV2.Vint; 1375 } 1376 else if (irm == modregrm(3,5,SP)) // SUB ESP,imm 1377 { 1378 assert(c.IFL2 == FLconst); 1379 ci.spadjust = (op == 0x81) ? 
-c.IEV2.Vint : -cast(int)cast(byte)c.IEV2.Vint; 1380 } 1381 r = grprw[0][reg][0]; // Grp 1 1382 w = grprw[0][reg][1]; 1383 break; 1384 1385 case 0x8F: 1386 if (reg == 0) // POP rm 1387 goto Lpop; 1388 break; 1389 1390 case 0xA0: 1391 case 0xA1: 1392 case 0xA2: 1393 case 0xA3: 1394 // Fake having an EA to simplify code in conflict() 1395 ci.flags |= CIFL.ea; 1396 ci.reg = 0; 1397 ci.sibmodrm = a32 ? modregrm(0,0,5) : modregrm(0,0,6); 1398 c.IFL1 = c.IFL2; 1399 c.IEV1 = c.IEV2; 1400 break; 1401 1402 case 0xC2: 1403 case 0xC3: 1404 case 0xCA: 1405 case 0xCB: // RET 1406 ci.a |= mSP; 1407 break; 1408 1409 case 0xE8: 1410 if (c.Iflags & CFclassinit) // call to __j_classinit 1411 { r = 0; 1412 w = F; 1413 1414 version (CLASSINIT2) 1415 ci.pair = UV; // it is patched to CMP EAX,0 1416 else 1417 ci.pair = NP; 1418 1419 } 1420 break; 1421 1422 case 0xF6: 1423 r = grprw[3][reg][0]; // Grp 3, byte version 1424 w = grprw[3][reg][1]; 1425 break; 1426 1427 case 0xF7: 1428 r = grprw[1][reg][0]; // Grp 3 1429 w = grprw[1][reg][1]; 1430 break; 1431 1432 case 0x0F: 1433 op2 = c.Iop & 0xFF; 1434 if ((op2 & 0xF0) == 0x80) // if Jxx instructions 1435 { 1436 ci.r = F | N; 1437 ci.w = N; 1438 goto Lret; 1439 } 1440 ci.r = N; 1441 ci.w = N; // copout for now 1442 goto Lret; 1443 1444 case 0xD7: // XLAT 1445 ci.a = mAX | mBX; 1446 break; 1447 1448 case 0xFF: 1449 r = grprw[2][reg][0]; // Grp 5 1450 w = grprw[2][reg][1]; 1451 if (reg == 6) // PUSH rm 1452 goto Lpush; 1453 break; 1454 1455 case 0x38: 1456 case 0x39: 1457 case 0x3A: 1458 case 0x3B: 1459 case 0x3C: // CMP AL,imm8 1460 case 0x3D: // CMP EAX,imm32 1461 // For CMP opcodes, always test for flags 1462 c.Iflags |= CFpsw; 1463 break; 1464 1465 case ESCAPE: 1466 if (c.Iop == (ESCAPE | ESCadjfpu)) 1467 ci.fpuadjust = c.IEV1.Vint; 1468 break; 1469 1470 case 0xD0: 1471 case 0xD1: 1472 case 0xD2: 1473 case 0xD3: 1474 case 0xC0: 1475 case 0xC1: 1476 if (reg == 2 || reg == 3) // if RCL or RCR 1477 c.Iflags |= CFpsw; // always test for flags 
1478 break; 1479 1480 case 0xD8: 1481 case 0xD9: 1482 case 0xDA: 1483 case 0xDB: 1484 case 0xDC: 1485 case 0xDD: 1486 case 0xDE: 1487 case 0xDF: 1488 if (irm < 0xC0) 1489 { r = grpf1[op - 0xD8][reg][0]; 1490 w = grpf1[op - 0xD8][reg][1]; 1491 switch (op) 1492 { 1493 case 0xD8: 1494 if (reg == 3) // if FCOMP 1495 ci.fpuadjust = -1; 1496 else 1497 ci.fp_op = FP.fop; 1498 break; 1499 1500 case 0xD9: 1501 if (reg == 0) // if FLD float 1502 { ci.fpuadjust = 1; 1503 ci.fp_op = FP.fld; 1504 } 1505 else if (reg == 3) // if FSTP float 1506 { ci.fpuadjust = -1; 1507 ci.fp_op = FP.fstp; 1508 } 1509 else if (reg == 5 || reg == 7) 1510 sz = 2; 1511 else if (reg == 4 || reg == 6) 1512 sz = 28; 1513 break; 1514 case 0xDA: 1515 if (reg == 3) // if FICOMP 1516 ci.fpuadjust = -1; 1517 break; 1518 case 0xDB: 1519 if (reg == 0 || reg == 5) 1520 { ci.fpuadjust = 1; 1521 ci.fp_op = FP.fld; // FILD / FLD long double 1522 } 1523 if (reg == 3 || reg == 7) 1524 ci.fpuadjust = -1; 1525 if (reg == 7) 1526 ci.fp_op = FP.fstp; // FSTP long double 1527 if (reg == 5 || reg == 7) 1528 sz = 10; 1529 break; 1530 case 0xDC: 1531 sz = 8; 1532 if (reg == 3) // if FCOMP 1533 ci.fpuadjust = -1; 1534 else 1535 ci.fp_op = FP.fop; 1536 break; 1537 case 0xDD: 1538 if (reg == 0) // if FLD double 1539 { ci.fpuadjust = 1; 1540 ci.fp_op = FP.fld; 1541 } 1542 if (reg == 3) // if FSTP double 1543 { ci.fpuadjust = -1; 1544 ci.fp_op = FP.fstp; 1545 } 1546 if (reg == 7) 1547 sz = 2; 1548 else if (reg == 4 || reg == 6) 1549 sz = 108; 1550 else 1551 sz = 8; 1552 break; 1553 case 0xDE: 1554 sz = 2; 1555 if (reg == 3) // if FICOMP 1556 ci.fpuadjust = -1; 1557 break; 1558 case 0xDF: 1559 sz = 2; 1560 if (reg == 4 || reg == 6) 1561 sz = 10; 1562 else if (reg == 5 || reg == 7) 1563 sz = 8; 1564 if (reg == 0 || reg == 4 || reg == 5) 1565 ci.fpuadjust = 1; 1566 else if (reg == 3 || reg == 6 || reg == 7) 1567 ci.fpuadjust = -1; 1568 break; 1569 1570 default: 1571 break; 1572 } 1573 break; 1574 } 1575 else if (op == 0xDE) 1576 
{ ci.fpuadjust = -1; // pop versions of Fop's 1577 if (irm == 0xD9) 1578 ci.fpuadjust = -2; // FCOMPP 1579 } 1580 1581 // Most floating point opcodes aren't staged, but are 1582 // sent right through, in order to make use of the large 1583 // latencies with floating point instructions. 1584 if (ci.fp_op == FP.fld || 1585 (op == 0xD9 && (irm & 0xF8) == 0xC0)) 1586 { } // FLD ST(i) 1587 else 1588 ci.flags |= CIFL.nostage; 1589 1590 switch (op) 1591 { 1592 case 0xD8: 1593 r = S; 1594 w = C; 1595 if ((irm & ~7) == 0xD0) 1596 w |= S; 1597 break; 1598 case 0xD9: 1599 // FCHS or FABS or FSQRT 1600 if (irm == 0xE0 || irm == 0xE1 || irm == 0xFA) 1601 ci.fp_op = FP.fop; 1602 r = S; 1603 w = S|C; 1604 break; 1605 case 0xDA: 1606 if (irm == 0xE9) // FUCOMPP 1607 { r = S; 1608 w = S|C; 1609 break; 1610 } 1611 break; 1612 case 0xDB: 1613 if (irm == 0xE2) // FCLEX 1614 { r = 0; 1615 w = C; 1616 break; 1617 } 1618 if (irm == 0xE3) // FINIT 1619 { r = 0; 1620 w = S|C; 1621 break; 1622 } 1623 break; 1624 case 0xDC: 1625 case 0xDE: 1626 if ((irm & 0xF0) != 0xD0) 1627 { r = S; 1628 w = S|C; 1629 break; 1630 } 1631 break; 1632 case 0xDD: 1633 // Not entirely correct, but conservative 1634 r = S; 1635 w = S|C; 1636 break; 1637 case 0xDF: 1638 if (irm == 0xE0) // FSTSW AX 1639 { r = C; 1640 w = mAX; 1641 break; 1642 } 1643 break; 1644 1645 default: 1646 break; 1647 } 1648 break; 1649 1650 default: 1651 //printf("\t\tNo special case\n"); 1652 break; 1653 } 1654 1655 if ((r | w) & B) // if byte operation 1656 sz = 1; // operand size is 1 1657 1658 ci.r = r & ~(R | EA); 1659 ci.w = w & ~(R | EA); 1660 if (r & R) 1661 ci.r |= mask((r & B) ? (reg & 3) : reg); 1662 if (w & R) 1663 ci.w |= mask((w & B) ? 
(reg & 3) : reg); 1664 1665 // OR in bits for EA addressing mode 1666 if ((r | w) & EA) 1667 { ubyte sib; 1668 1669 sib = 0; 1670 switch (mod) 1671 { 1672 case 0: 1673 if (a32) 1674 { 1675 if (rm == 4) 1676 { 1677 sib = c.Isib; 1678 if ((sib & modregrm(0,7,0)) != modregrm(0,4,0)) 1679 ci.a |= mask((sib >> 3) & 7); // index register 1680 if ((sib & 7) != 5) 1681 ci.a |= mask(sib & 7); // base register 1682 } 1683 else if (rm != 5) 1684 ci.a |= mask(rm); 1685 } 1686 else 1687 { 1688 immutable ubyte[8] ea16 = [mBX|mSI,mBX|mDI,mBP|mSI,mBP|mDI,mSI,mDI,0,mBX]; 1689 ci.a |= ea16[rm]; 1690 } 1691 goto Lmem; 1692 1693 case 1: 1694 case 2: 1695 if (a32) 1696 { 1697 if (rm == 4) 1698 { 1699 sib = c.Isib; 1700 if ((sib & modregrm(0,7,0)) != modregrm(0,4,0)) 1701 ci.a |= mask((sib >> 3) & 7); // index register 1702 ci.a |= mask(sib & 7); // base register 1703 } 1704 else 1705 ci.a |= mask(rm); 1706 } 1707 else 1708 { 1709 immutable ubyte[8] ea16 = [mBX|mSI,mBX|mDI,mBP|mSI,mBP|mDI,mSI,mDI,mBP,mBX]; 1710 ci.a |= ea16[rm]; 1711 } 1712 1713 Lmem: 1714 if (r & EA) 1715 ci.r |= mMEM; 1716 if (w & EA) 1717 ci.w |= mMEM; 1718 ci.flags |= CIFL.ea; 1719 break; 1720 1721 case 3: 1722 if (r & EA) 1723 ci.r |= mask((r & B) ? (rm & 3) : rm); 1724 if (w & EA) 1725 ci.w |= mask((w & B) ? 
(rm & 3) : rm); 1726 break; 1727 1728 default: 1729 assert(0); 1730 } 1731 // Adjust sibmodrm so that addressing modes can be compared simply 1732 irm &= modregrm(3,0,7); 1733 if (a32) 1734 { 1735 if (irm != modregrm(0,0,5)) 1736 { 1737 switch (mod) 1738 { 1739 case 0: 1740 if ((sib & 7) != 5) // if not disp32[index] 1741 { 1742 c.IFL1 = FLconst; 1743 c.IEV1.Vpointer = 0; 1744 irm |= 0x80; 1745 } 1746 break; 1747 case 1: 1748 c.IEV1.Vpointer = cast(byte) c.IEV1.Vpointer; 1749 irm = modregrm(2, 0, rm); 1750 break; 1751 1752 default: 1753 break; 1754 } 1755 } 1756 } 1757 else 1758 { 1759 if (irm != modregrm(0,0,6)) 1760 { 1761 switch (mod) 1762 { 1763 case 0: 1764 c.IFL1 = FLconst; 1765 c.IEV1.Vpointer = 0; 1766 irm |= 0x80; 1767 break; 1768 case 1: 1769 c.IEV1.Vpointer = cast(byte) c.IEV1.Vpointer; 1770 irm = modregrm(2, 0, rm); 1771 break; 1772 1773 default: 1774 break; 1775 } 1776 } 1777 } 1778 1779 ci.r |= ci.a; 1780 ci.reg = reg; 1781 ci.sibmodrm = (sib << 8) | irm; 1782 } 1783 Lret: 1784 if (ci.w & mSP) // if stack pointer is modified 1785 ci.w |= mMEM; // then we are implicitly writing to memory 1786 if (op == LEA) // if LEA 1787 ci.r &= ~mMEM; // memory is not actually read 1788 ci.sz = cast(ubyte)sz; 1789 1790 //printf("\t\t"); ci.print(); 1791 } 1792 1793 /****************************************** 1794 * Determine if two instructions can pair. 1795 * Assume that in general, cu can pair in the U pipe and cv in the V. 1796 * Look for things like register contentions. 
1797 * Input: 1798 * cu instruction for U pipe 1799 * cv instruction for V pipe 1800 * Returns: 1801 * !=0 if they can pair 1802 */ 1803 1804 private int pair_test(const ref Cinfo cu, const ref Cinfo cv) 1805 { 1806 uint pcu; 1807 uint pcv; 1808 uint r1,w1; 1809 uint r2,w2; 1810 uint x; 1811 1812 pcu = cu.pair; 1813 if (!(pcu & PU)) 1814 { 1815 // See if pairs with FXCH and cv is FXCH 1816 if (pcu & FX && cv.c.Iop == 0xD9 && (cv.c.Irm & ~7) == 0xC8) 1817 goto Lpair; 1818 goto Lnopair; 1819 } 1820 pcv = cv.pair; 1821 if (!(pcv & PV)) 1822 goto Lnopair; 1823 1824 r1 = cu.r; 1825 w1 = cu.w; 1826 r2 = cv.r; 1827 w2 = cv.w; 1828 1829 x = w1 & (r2 | w2) & ~(F|mMEM); // register contention 1830 if (x && // if register contention 1831 !(x == mSP && pcu & pcv & PE) // and not exception 1832 ) 1833 goto Lnopair; 1834 1835 // Look for flags contention 1836 if (w1 & r2 & F && !(pcv & PF)) 1837 goto Lnopair; 1838 1839 Lpair: 1840 return 1; 1841 1842 Lnopair: 1843 return 0; 1844 } 1845 1846 /****************************************** 1847 * Determine if two instructions have an AGI or register contention. 1848 * Returns: 1849 * !=0 if they have an AGI 1850 */ 1851 1852 private int pair_agi(const ref Cinfo c1, const ref Cinfo c2) pure 1853 { 1854 uint x = c1.w & c2.a; 1855 return x && !(x == mSP && c1.pair & c2.pair & PE); 1856 } 1857 1858 /******************************************** 1859 * Determine if three instructions can decode simultaneously 1860 * in Pentium Pro and Pentium II. 1861 * Input: 1862 * c0,c1,c2 candidates for decoders 0,1,2 1863 * c2 can be null 1864 * Returns: 1865 * !=0 if they can decode simultaneously 1866 */ 1867 1868 private int triple_test(Cinfo *c0, Cinfo *c1, Cinfo *c2) 1869 { 1870 assert(c0); 1871 if (!c1) 1872 return 0; 1873 int c2isz = c2 ? 
c2.isz : 0; 1874 if (c0.isz > 7 || c1.isz > 7 || c2isz > 7 || 1875 c0.isz + c1.isz + c2isz > 16) 1876 return 0; 1877 1878 // 4-1-1 decode 1879 if (c1.uops > 1 || 1880 (c2 && c2.uops > 1)) 1881 return 0; 1882 1883 return 1; 1884 } 1885 1886 /******************************************** 1887 * Get next instruction worth looking at for scheduling. 1888 * Returns: 1889 * null no more instructions 1890 */ 1891 1892 private code * cnext(code *c) 1893 { 1894 while (1) 1895 { 1896 c = code_next(c); 1897 if (!c) 1898 break; 1899 if (c.Iflags & (CFtarg | CFtarg2)) 1900 break; 1901 if (!(c.Iop == NOP || 1902 c.Iop == (ESCAPE | ESClinnum))) 1903 break; 1904 } 1905 return c; 1906 } 1907 1908 /****************************************** 1909 * Instruction scheduler. 1910 * Input: 1911 * c list of instructions to schedule 1912 * scratch scratch registers we can use 1913 * Returns: 1914 * revised list of scheduled instructions 1915 */ 1916 1917 /////////////////////////////////// 1918 // Determine if c1 and c2 are swappable. 1919 // c1 comes before c2. 
// If they do not conflict
//      return 0
// If they do conflict
//      return 0x100 + delay_clocks
// Input:
//      fpsched         if 1, then adjust fxch_pre and fxch_post to swap,
//                      then return 0
//                      if 2, then adjust ci1 as well as ci2
// Notes:
//      Instructions flagged CFvolatile or CFvex, and instructions whose
//      read mask contains N, always conflict.
//      FPU scheduling (fpsched) is only honored when both instructions
//      have a non-zero fp_op; otherwise fpsched is treated as 0.

@trusted
private int conflict(Cinfo *ci1,Cinfo *ci2,int fpsched)
{
    code *c1;
    code *c2;
    uint r1,w1,a1;
    uint r2,w2,a2;
    int sz1,sz2;
    int i = 0;                  // set to 1 (only in disabled debug code) to trace decisions
    int delay_clocks;

    c1 = ci1.c;
    c2 = ci2.c;

    //printf("conflict %x %x\n",c1,c2);

    r1 = ci1.r;
    w1 = ci1.w;
    a1 = ci1.a;
    sz1 = ci1.sz;

    r2 = ci2.r;
    w2 = ci2.w;
    a2 = ci2.a;
    sz2 = ci2.sz;

    //printf("r1 %lx w1 %lx a1 %lx sz1 %x\n",r1,w1,a1,sz1);
    //printf("r2 %lx w2 %lx a2 %lx sz2 %x\n",r2,w2,a2,sz2);

    if ((c1.Iflags | c2.Iflags) & (CFvolatile | CFvex))
        goto Lconflict;

    // Determine if we should handle FPU register conflicts separately
    //if (fpsched) printf("fp_op %d,%d:\n",ci1.fp_op,ci2.fp_op);
    if (fpsched && ci1.fp_op && ci2.fp_op)
    {
        // FPU stack/condition-code dependencies are resolved with FXCH at Lswap
        w1 &= ~(S|C);
        r1 &= ~(S|C);
        w2 &= ~(S|C);
        r2 &= ~(S|C);
    }
    else
        fpsched = 0;

    if ((r1 | r2) & N)
    {
        goto Lconflict;
    }

static if (0)
{
    if (c1.Iop == 0xFF && c2.Iop == 0x8B)
    {   c1.print(); c2.print(); i = 1;
        printf("r1=%lx, w1=%lx, a1=%lx, sz1=%d, r2=%lx, w2=%lx, a2=%lx, sz2=%d\n",r1,w1,a1,sz1,r2,w2,a2,sz2);
    }
}
L1:
    if (w1 & r2 || (r1 | w1) & w2)
    {   ubyte ifl1,ifl2;

        if (i) printf("test\n");

static if (0)
{
        if (c1.IFL1 != c2.IFL1) printf("t1\n");
        if ((c1.Irm & modregrm(3,0,7)) != (c2.Irm & modregrm(3,0,7))) printf("t2\n");
        if ((issib(c1.Irm) && c1.Isib != c2.Isib)) printf("t3\n");
        if (c1.IEV1.Vpointer + sz1 <= c2.IEV1.Vpointer) printf("t4\n");
        if (c2.IEV1.Vpointer + sz2 <= c1.IEV1.Vpointer) printf("t5\n");
}

        // make sure CFpsw is reliably set
        if (w1 & w2 & F &&      // if both instructions write to flags
            w1 != F &&
            w2 != F &&
            !((r1 | r2) & F) && // but neither instruction reads them
            !((c1.Iflags | c2.Iflags) & CFpsw))       // and we don't care about flags
        {
            w1 &= ~F;
            w2 &= ~F;           // remove conflict
            goto L1;            // and try again
        }

        // If other than the memory reference is a conflict
        if (w1 & r2 & ~mMEM || (r1 | w1) & w2 & ~mMEM)
        {   if (i) printf("\t1\n");
            if (i) printf("r1=%x, w1=%x, a1=%x, sz1=%d, r2=%x, w2=%x, a2=%x, sz2=%d\n",r1,w1,a1,sz1,r2,w2,a2,sz2);
            goto Lconflict;
        }

        // If referring to distinct types, then no dependency
        if (c1.Irex && c2.Irex && c1.Irex != c2.Irex)
            goto Lswap;

        ifl1 = c1.IFL1;
        ifl2 = c2.IFL1;

        // Special case: Allow indexed references using registers other than
        // ESP and EBP to be swapped with PUSH instructions
        // (here c1 is the PUSH and c2 holds the EA)
        if (((c1.Iop & ~7) == 0x50 ||                   // PUSH reg
             c1.Iop == 0x6A ||                          // PUSH imm8
             c1.Iop == 0x68 ||                          // PUSH imm16/imm32
             (c1.Iop == 0xFF && ci1.reg == 6)           // PUSH EA
            ) &&
            ci2.flags & CIFL.ea && !(a2 & mSP) &&
            !(a2 & mBP && cast(int)c2.IEV1.Vpointer < 0)
           )
        {
            if (c1.Iop == 0xFF)
            {
                if (!(w2 & mMEM))
                    goto Lswap;
            }
            else
                goto Lswap;
        }

        // Special case: Allow indexed references using registers other than
        // ESP and EBP to be swapped with PUSH instructions
        // (mirror of the case above: c2 is the PUSH and c1 holds the EA, so
        // the negative-offset-from-EBP test must look at c1's addressing
        // registers and displacement, not c2's)
        if (((c2.Iop & ~7) == 0x50 ||                   // PUSH reg
             c2.Iop == 0x6A ||                          // PUSH imm8
             c2.Iop == 0x68 ||                          // PUSH imm16/imm32
             (c2.Iop == 0xFF && ci2.reg == 6)           // PUSH EA
            ) &&
            ci1.flags & CIFL.ea && !(a1 & mSP) &&
            !(a1 & mBP && cast(int)c1.IEV1.Vpointer < 0)
           )
        {
            if (c2.Iop == 0xFF)
            {
                if (!(w1 & mMEM))
                    goto Lswap;
            }
            else
                goto Lswap;
        }

        // If not both an EA addressing mode, conflict
        if (!(ci1.flags & ci2.flags & CIFL.ea))
        {   if (i) printf("\t2\n");
            goto Lconflict;
        }

        // Same addressing mode: the references are independent if they use
        // different fixup kinds or non-overlapping offsets
        if (ci1.sibmodrm == ci2.sibmodrm)
        {   if (ifl1 != ifl2)
                goto Lswap;
            switch (ifl1)
            {
                case FLconst:
                    if (c1.IEV1.Vint != c2.IEV1.Vint &&
                        (c1.IEV1.Vint + sz1 <= c2.IEV1.Vint ||
                         c2.IEV1.Vint + sz2 <= c1.IEV1.Vint))
                        goto Lswap;
                    break;
                case FLdatseg:
                    if (c1.IEV1.Vseg != c2.IEV1.Vseg ||
                        c1.IEV1.Vint + sz1 <= c2.IEV1.Vint ||
                        c2.IEV1.Vint + sz2 <= c1.IEV1.Vint)
                        goto Lswap;
                    break;

                default:
                    break;
            }
        }

        if ((c1.Iflags | c2.Iflags) & CFunambig &&
            (ifl1 != ifl2 ||
             ci1.sibmodrm != ci2.sibmodrm ||
             (c1.IEV1.Vint != c2.IEV1.Vint &&
              (c1.IEV1.Vint + sz1 <= c2.IEV1.Vint ||
               c2.IEV1.Vint + sz2 <= c1.IEV1.Vint)
             )
            )
           )
        {
            // Assume that [EBP] and [ESP] can point to the same location
            if (((a1 | a2) & (mBP | mSP)) == (mBP | mSP))
                goto Lconflict;
            goto Lswap;
        }

        if (i) printf("\t3\n");
        goto Lconflict;
    }

Lswap:
    if (fpsched)
    {
        //printf("\tfpsched %d,%d:\n",ci1.fp_op,ci2.fp_op);
        ubyte x1 = ci1.fxch_pre;
        ubyte y1 = ci1.fxch_post;
        ubyte x2 = ci2.fxch_pre;
        ubyte y2 = ci2.fxch_post;

        // Combine the two fp_op kinds into a single switch key
        static uint X(uint a, uint b) { return (a << 8) | b; }
        switch (X(ci1.fp_op,ci2.fp_op))
        {
            case X(FP.fstp, FP.fld):
                if (x1 || y1)
                    goto Lconflict;
                if (x2)
                    goto Lconflict;
                if (y2 == 0)
                    ci2.fxch_post++;
                else if (y2 == 1)
                {
                    ci2.fxch_pre++;
                    ci2.fxch_post++;
                }
                else
                {
                    goto Lconflict;
                }
                break;

            case X(FP.fstp, FP.fop):
                if (x1 || y1)
                    goto Lconflict;
                ci2.fxch_pre++;
                ci2.fxch_post++;
                break;

            case X(FP.fop, FP.fop):
                if (x1 == 0 && y1 == 1 && x2 == 0 && y2 == 0)
                {   ci2.fxch_pre = 1;
                    ci2.fxch_post = 1;
                    break;
                }
                if (x1 == 0 && y1 == 0 && x2 == 1 && y2 == 1)
                    break;
                goto Lconflict;

            case X(FP.fop, FP.fld):
                if (x1 || y1)
                    goto Lconflict;
                if (x2)
                    goto Lconflict;
                if (y2)
                    break;
                else if (fpsched == 2)
                    ci1.fxch_post = 1;
                ci2.fxch_post = 1;
                break;

            default:
                goto Lconflict;
        }

        //printf("\tpre = %d, post = %d\n",ci2.fxch_pre,ci2.fxch_post);
    }

    //printf("w1 = x%x, w2 = x%x\n",w1,w2);
    if (i) printf("no conflict\n\n");
    return 0;

Lconflict:
    //printf("r1=%x, w1=%x, r2=%x, w2=%x\n",r1,w1,r2,w2);
    delay_clocks = 0;

    // Determine if AGI
    if (!PRO && pair_agi(*ci1, *ci2))
        delay_clocks = 1;

    // Special delays for floating point
    if (fpsched)
    {   if (ci1.fp_op == FP.fld && ci2.fp_op == FP.fstp)
            delay_clocks = 1;
        else if (ci1.fp_op == FP.fop && ci2.fp_op == FP.fstp)
            delay_clocks = 3;
        else if (ci1.fp_op == FP.fop && ci2.fp_op == FP.fop)
            delay_clocks = 2;
    }
    else if (PRO)
    {
        // Look for partial register write stalls
        if (w1 & r2 & ALLREGS && sz1 < sz2)
            delay_clocks = 7;
    }
    else if ((w1 | r1) & (w2 | r2) & (C | S))
    {
        int op = c1.Iop;
        int reg = c1.Irm & modregrm(0,7,0);
        if (ci1.fp_op == FP.fld ||
            (op == 0xD9 && (c1.Irm & 0xF8) == 0xC0)
           )
        { }                             // FLD
        else if (op == 0xD9 && (c1.Irm & 0xF8) == 0xC8)
        { }                             // FXCH
        else if (c2.Iop == 0xD9 && (c2.Irm & 0xF8) == 0xC8)
        { }                             // FXCH
        else
            delay_clocks = 3;
    }

    if (i) printf("conflict %d\n\n",delay_clocks);
    return 0x100 + delay_clocks;
}

enum TBLMAX = 2*3*20;           // must be divisible by both 2 and 3
                                // (U,V pipe in Pentium, 3 decode units
                                //  in Pentium Pro)

/******************************************
 * State of the scheduler for one run of instructions: the slot table
 * the instructions are placed into, the Cinfo storage backing them,
 * and the staging list of instructions not yet placed.
 */
struct Schedule
{
nothrow:
    Cinfo*[TBLMAX] tbl;         // even numbers are U pipe, odd numbers are V
    int tblmax;                 // max number of slots used

    Cinfo[TBLMAX] cinfo;        // storage handed out by stage(), one per staged instruction
    int cinfomax;               // number of cinfo[] entries handed out

    Barray!(Cinfo*) stagelist;  // list of instructions in staging area

    int fpustackused;           // number of slots in FPU stack that are used

    /******************************
     * Reset the scheduler to empty.
     * Every field, including stagelist, is zero-filled; nothing is
     * released first.
     * Params:
     *      fpustackinit = initial value for fpustackused
     */
    @trusted
    void initialize(int fpustackinit)          // initialize scheduler
    {
        //printf("Schedule::initialize(fpustackinit = %d)\n", fpustackinit);
        memset(&this, 0, Schedule.sizeof);
        fpustackused = fpustackinit;
    }

    /******************************
     * Release the staging list.
     */
    @trusted
    void dtor()
    {
        stagelist.dtor();
    }

    /******************************
     * Reassemble the scheduled instructions into a linked list.
     * Staged instructions are inserted into the table while they fit;
     * the table is then emitted in slot order (with any FXCH
     * prefix/postfix instructions), followed by whatever staged
     * instructions could not be inserted. The staging list is emptied.
     * Params:
     *      pc = where to store the first instruction; *pc must be null
     * Returns:
     *      address of the next field of the last instruction emitted
     *      (pc itself if nothing was emitted)
     */
    @trusted
    code **assemble(code **pc)  // reassemble scheduled instructions
    {
        code *c;

        debug
        if (debugs) printf("assemble:\n");

        assert(!*pc);

        // Try to insert the rest of the staged instructions
        size_t sli;
        for (sli = 0; sli < stagelist.length; ++sli)
        {
            Cinfo* ci = stagelist[sli];
            if (!ci)
                continue;
            if (!insert(ci))
                break;
        }

        // Get the instructions out of the schedule table
        assert(cast(uint)tblmax <= TBLMAX);
        for (int i = 0; i < tblmax; i++)
        {
            Cinfo* ci = tbl[i];

            debug
            if (debugs)
            {
                if (PRO)
                {   immutable char[4][3] tbl = [ "0 "," 1 "," 2" ];

                    if (ci)
                        printf("%s %d ",tbl[i - ((i / 3) * 3)].ptr,ci.uops);
                    else
                        printf("%s ",tbl[i - ((i / 3) * 3)].ptr);
                }
                else
                {
                    printf((i & 1) ? " V " : "U ");
                }
                if (ci)
                    ci.c.print();
                else
                    printf("\n");
            }

            if (!ci)
                continue;
            fpustackused += ci.fpuadjust;
            //printf("stage()1: fpustackused = %d\n", fpustackused);
            c = ci.c;
            if (i == 0)
                c.Iflags |= CFtarg;     // by definition, first is always a jump target
            else
                c.Iflags &= ~CFtarg;    // the rest are not

            // Put in any FXCH prefix
            if (ci.fxch_pre)
            {   code *cf;
                assert(i);
                cf = gen2(null,0xD9,0xC8 + ci.fxch_pre);
                *pc = cf;
                pc = &cf.next;
            }

            // Append c and advance pc to the end of whatever is chained to it
            *pc = c;
            do
            {
                assert(*pc != code_next(*pc));
                pc = &(*pc).next;
            } while (*pc);

            // Put in any FXCH postfix
            if (ci.fxch_post)
            {
                // Only the next occupied slot is examined
                for (int j = i + 1; j < tblmax; j++)
                {   if (tbl[j])
                    {   if (tbl[j].fxch_pre == ci.fxch_post)
                        {
                            tbl[j].fxch_pre = 0;        // they cancel each other out
                            goto L1;
                        }
                        break;
                    }
                }
                {   code *cf;
                    cf = gen2(null,0xD9,0xC8 + ci.fxch_post);
                    *pc = cf;
                    pc = &cf.next;
                }
            }
        L1:
        }

        // Just append any instructions left in the staging area
        foreach (ci; stagelist[sli .. stagelist.length])
        {
            if (!ci)
                continue;

            debug
            if (debugs) { printf("appending: "); ci.c.print(); }

            *pc = ci.c;
            do
            {
                pc = &(*pc).next;

            } while (*pc);
            fpustackused += ci.fpuadjust;
            //printf("stage()2: fpustackused = %d\n", fpustackused);
        }
        stagelist.setLength(0);

        return pc;
    }

    /******************************
     * Insert c into scheduling table.
     * The table is scanned backwards from tblmax for the last entry
     * that conflicts with ci; ci is then placed in the first suitable
     * free slot at or after the position that conflict allows.
     * Params:
     *      ci = instruction to insert
     * Returns:
     *      0       could not be scheduled; have to start a new one
     *      1       ci was placed in tbl[]
     */

    int insert(Cinfo *ci)
    {   code *c;
        int clocks;
        int i;
        int ic = 0;
        int imin;
        targ_size_t offset;
        targ_size_t vpointer;
        int movesp = 0;
        int reg2 = -1;              // avoid "may be uninitialized" warning

        //printf("insert "); ci.c.print();
        //printf("insert() %d\n", fpustackused);
        c = ci.c;
        //printf("\tc.Iop %x\n",c.Iop);
        vpointer = c.IEV1.Vpointer;         // saved so Lnoinsert can undo any offset adjustment
        assert(cast(uint)tblmax <= TBLMAX);
        if (tblmax == TBLMAX)               // if out of space
            goto Lnoinsert;
        if (tblmax == 0)                    // if table is empty
        {   // Just stuff it in the first slot
            i = tblmax;
            goto Linsert;
        }
        else if (c.Iflags & (CFtarg | CFtarg2))
            // Jump targets can only be first in the scheduler
            goto Lnoinsert;

        // Special case of:
        //  PUSH reg1
        //  MOV  reg2,x[ESP]
        if (c.Iop == 0x8B &&
            (c.Irm & modregrm(3,0,7)) == modregrm(1,0,4) &&
            c.Isib == modregrm(0,4,SP) &&
            c.IFL1 == FLconst &&
            (cast(byte)c.IEV1.Vpointer) >= REGSIZE
           )
        {
            movesp = 1;                     // this is a MOV reg2,offset[ESP]
            offset = cast(byte)c.IEV1.Vpointer;
            reg2 = (c.Irm >> 3) & 7;
        }


        // Start at tblmax, and back up until we get a conflict
        ic = -1;
        imin = 0;
        for (i = tblmax; i >= 0; i--)
        {
            Cinfo* cit = tbl[i];
            if (!cit)
                continue;

            // Look for special case swap
            // (the [ESP] offset of c is adjusted by the PUSH's spadjust so
            // that c can move ahead of it)
            if (movesp &&
                (cit.c.Iop & ~7) == 0x50 &&             // if PUSH reg1
                (cit.c.Iop & 7) != reg2 &&              // if reg1 != reg2
                (cast(byte)c.IEV1.Vpointer) >= -cit.spadjust
               )
            {
                c.IEV1.Vpointer += cit.spadjust;
                //printf("\t1, spadjust = %d, ptr = x%x\n",cit.spadjust,c.IEV1.Vpointer);
                continue;
            }

            if (movesp &&
                cit.c.Iop == 0x83 &&
                cit.c.Irm == modregrm(3,5,SP) &&        // if SUB ESP,offset
                cit.c.IFL2 == FLconst &&
                (cast(byte)c.IEV1.Vpointer) >= -cit.spadjust
               )
            {
                //printf("\t2, spadjust = %d\n",cit.spadjust);
                c.IEV1.Vpointer += cit.spadjust;
                continue;
            }

            clocks = conflict(cit,ci,1);
            if (clocks)
            {   int j;

                ic = i;                     // where the conflict occurred
                clocks &= 0xFF;             // convert to delay count

                // Move forward the delay clocks
                if (clocks == 0)
                    j = i + 1;
                else if (PRO)
                    j = (((i + 3) / 3) * 3) + clocks * 3;   // start of a later group of 3 slots
                else
                {   j = ((i + 2) & ~1) + clocks * 2;        // start of a later U/V slot pair

                    // It's possible we skipped over some AGI generating
                    // instructions due to movesp.
                    int k;
                    for (k = i + 1; k < j; k++)
                    {
                        if (k >= TBLMAX)
                            goto Lnoinsert;
                        if (tbl[k] && pair_agi(*tbl[k], *ci))
                        {
                            k = ((k + 2) & ~1) + 1;
                        }
                    }
                    j = k;
                }

                if (j >= TBLMAX)                    // exceed table size?
                    goto Lnoinsert;
                imin = j;                           // first possible slot c can go in
                break;
            }
        }


        // Scan forward looking for a hole to put it in
        for (i = imin; i < TBLMAX; i++)
        {
            if (tbl[i])
            {
                // In case, due to movesp, we skipped over some AGI instructions
                if (!PRO && pair_agi(*tbl[i], *ci))
                {
                    i = ((i + 2) & ~1) + 1;
                    if (i >= TBLMAX)
                        goto Lnoinsert;
                }
            }
            else
            {
                if (PRO)
                {   int i0 = (i / 3) * 3;           // index of decode unit 0
                    Cinfo *ci0;

                    assert(((TBLMAX / 3) * 3) == TBLMAX);
                    switch (i - i0)
                    {
                        case 0:                     // i0 can handle any instruction
                            goto Linsert;
                        case 1:
                            ci0 = tbl[i0];
                            if (ci.uops > 1)
                            {
                                // A multi-uop instruction is moved into slot i0
                                // by shifting the single-uop occupant up (at L1)
                                if (i0 >= imin && ci0.uops == 1)
                                    goto L1;
                                i++;
                                break;
                            }
                            if (triple_test(ci0,ci,tbl[i0 + 2]))
                                goto Linsert;
                            break;
                        case 2:
                            ci0 = tbl[i0];
                            if (ci.uops > 1)
                            {
                                if (i0 >= imin && ci0.uops == 1)
                                {
                                    if (i >= tblmax)
                                    {   if (i + 1 >= TBLMAX)
                                            goto Lnoinsert;
                                        tblmax = i + 1;
                                    }
                                    // Shift slots i0 and i0+1 up by one to free slot i0 for ci
                                    tbl[i0 + 2] = tbl[i0 + 1];
                                    tbl[i0 + 1] = ci0;
                                    i = i0;
                                    goto Linsert;
                                }
                                break;
                            }
                            if (triple_test(ci0,tbl[i0 + 1],ci))
                                goto Linsert;
                            break;
                        default:
                            assert(0);
                    }
                }
                else
                {
                    assert((TBLMAX & 1) == 0);
                    if (i & 1)                      // if V pipe
                    {
                        if (pair_test(*tbl[i - 1], *ci))
                        {
                            goto Linsert;
                        }
                        else if (i > imin && pair_test(*ci, *tbl[i - 1]))
                        {
                    L1:
                            // Move the previous slot's instruction into slot i
                            // and put ci in its place
                            tbl[i] = tbl[i - 1];
                            if (i >= tblmax)
                                tblmax = i + 1;
                            i--;
                            //printf("\tswapping with x%02x\n",tbl[i + 1].c.Iop);
                            goto Linsert;
                        }
                    }
                    else                    // will always fit in U pipe
                    {
                        assert(!tbl[i + 1]);        // because V pipe should be empty
                        goto Linsert;
                    }
                }
            }
        }

    Lnoinsert:
        //printf("\tnoinsert\n");
        c.IEV1.Vpointer = vpointer;  // reset to original value
        return 0;

    Linsert:
        // Insert at location i
        assert(i < TBLMAX);
        assert(tblmax <= TBLMAX);
        tbl[i] = ci;
        //printf("\tinsert at location %d\n",i);

        // If it's a scheduled floating point code, we have to adjust
        // the FXCH values
        if (ci.fp_op)
        {
            ci.fxch_pre = 0;
            ci.fxch_post = 0;                      // start over again

            // Sum the FPU stack adjustments of the entries in tbl[0 .. tblmax]
            int fpu = fpustackused;
            for (int j = 0; j < tblmax; j++)
            {
                if (tbl[j])
                {
                    fpu += tbl[j].fpuadjust;
                    if (fpu >= 8)                   // if FPU stack overflow
                    {   tbl[i] = null;
                        //printf("fpu stack overflow\n");
                        goto Lnoinsert;
                    }
                }
            }

            // Let conflict() (fpsched == 2) set up the FXCH values against
            // each entry placed after slot i; its return value is not used
            for (int j = tblmax; j > i; j--)
            {
                if (j < TBLMAX && tbl[j])
                    conflict(tbl[j],ci,2);
            }
        }

        if (movesp)
        {   // Adjust [ESP] offsets

            //printf("\tic = %d, inserting at %d\n",ic,i);
            assert(cast(uint)tblmax <= TBLMAX);
            for (int j = ic + 1; j < i; j++)
            {
                Cinfo* cit = tbl[j];
                if (cit)
                {
                    c.IEV1.Vpointer -= cit.spadjust;
                    //printf("\t3, spadjust = %d, ptr = x%x\n",cit.spadjust,c.IEV1.Vpointer);
                }
            }
        }
        if (i >= tblmax)
            tblmax = i + 1;

        // Now do a hack. Look back at immediately preceding instructions,
        // and see if we can swap with a push.
        // (disabled: the condition below is always false)
        if (0 && movesp)
        {
            while (1)
            {
                int j;
                for (j = 1; i > j; j++)
                    if (tbl[i - j])
                        break;

                if (i >= j && tbl[i - j] &&
                       (tbl[i - j].c.Iop & ~7) == 0x50 &&       // if PUSH reg1
                       (tbl[i - j].c.Iop & 7) != reg2 &&  // if reg1 != reg2
                       cast(byte)c.IEV1.Vpointer >= REGSIZE)
                {
                    //printf("\t-4 prec, i-j=%d, i=%d\n",i-j,i);
                    assert(cast(uint)i < TBLMAX);
                    assert(cast(uint)(i - j) < TBLMAX);
                    tbl[i] = tbl[i - j];
                    tbl[i - j] = ci;
                    i -= j;
                    c.IEV1.Vpointer -= REGSIZE;
                }
                else
                    break;
            }
        }

        //printf("\tinsert\n");
        return 1;
    }

    /******************************
     * Insert c into staging area.
     * A Cinfo is taken from cinfo[] and filled in by getinfo(). Staged
     * instructions that have an AGI or a conflict with c are first moved
     * into the scheduling table with insert(). Jump targets and
     * CFvolatile/CFvex instructions flush the whole staging list and are
     * inserted directly.
     * Params:
     *      c = instruction to stage
     * Returns:
     *      false if could not be scheduled; have to start a new one
     */

    @trusted
    bool stage(code *c)
    {
        //printf("stage: "); c.print();
        if (cinfomax == TBLMAX)             // if out of space
            return false;
        auto ci = &cinfo[cinfomax++];
        getinfo(*ci,c);

        if (c.Iflags & (CFtarg | CFtarg2 | CFvolatile | CFvex))
        {
            // Insert anything in stagelist
            foreach (ref cs; stagelist[])
            {
                if (cs)
                {
                    if (!insert(cs))
                        return false;
                    cs = null;              // entry has left the staging area
                }
            }
            return insert(ci) != 0;
        }

        // Look through stagelist, and insert any AGI conflicting instructions
        bool agi = false;
        foreach (ref cs; stagelist[])
        {
            if (cs)
            {
                if (pair_agi(*cs, *ci))
                {
                    if (!insert(cs))
                        goto Lnostage;
                    cs = null;
                    agi = true;                    // we put out an AGI
                }
            }
        }

        // Look through stagelist, and insert any other conflicting instructions
        foreach (i, ref cs; stagelist[])
        {
            if (!cs)
                continue;
            if (conflict(cs,ci,0) &&                // if conflict
                !(cs.flags & ci.flags & CIFL.push)) // (two swappable pushes are let through)
            {
                if (cs.spadjust)
                {
                    // We need to insert all previous adjustments to ESP
                    foreach (ref ca; stagelist[0 .. i])
                    {
                        if (ca && ca.spadjust)
                        {
                            if (!insert(ca))
                                goto Lnostage;
                            ca = null;
                        }
                    }
                }

                if (!insert(cs))
                    goto Lnostage;
                cs = null;
            }
        }

        // If floating point opcode, don't stage it, send it right out
        if (!agi && ci.flags & CIFL.nostage)
        {
            if (!insert(ci))
                goto Lnostage;
            return true;
        }

        stagelist.push(ci);         // append to staging list
        return true;

    Lnostage:
        return false;
    }

}



/********************************************
 * Snip off tail of instruction sequence.
 * The list is cut after c and after any instructions following it
 * that are NOPs, line number escapes, or carry the same CFclassinit
 * flag as c, provided they are not jump targets.
 * Returns:
 *      next instruction (the tail) or
 *      null for no more instructions
 */

private code * csnip(code *c)
{
    if (c)
    {
        uint iflags = c.Iflags & CFclassinit;
        code **pc;
        while (1)
        {
            pc = &c.next;
            c = *pc;
            if (!c)
                break;
            if (c.Iflags & (CFtarg | CFtarg2))
                break;
            if (!(c.Iop == NOP ||
                  c.Iop == (ESCAPE | ESClinnum) ||
                  c.Iflags & iflags))
                break;
        }
        *pc = null;                 // detach the tail from the snipped-off head
    }
    return c;
}


/******************************
 * Schedule Pentium instructions,
 * based on Steve Russell's algorithm.
2810 */ 2811 2812 @trusted 2813 private code *schedule(code *c,regm_t scratch) 2814 { 2815 code *cresult = null; 2816 code **pctail = &cresult; 2817 Schedule sch = void; 2818 2819 sch.initialize(0); // initialize scheduling table 2820 while (c) 2821 { 2822 if ((c.Iop == NOP || 2823 ((c.Iop & ESCAPEmask) == ESCAPE && c.Iop != (ESCAPE | ESCadjfpu)) || 2824 c.Iflags & CFclassinit) && 2825 !(c.Iflags & (CFtarg | CFtarg2))) 2826 { code *cn; 2827 2828 // Just append this instruction to pctail and go to the next one 2829 *pctail = c; 2830 cn = code_next(c); 2831 c.next = null; 2832 pctail = &c.next; 2833 c = cn; 2834 continue; 2835 } 2836 2837 //printf("init\n"); 2838 sch.initialize(sch.fpustackused); // initialize scheduling table 2839 2840 while (c) 2841 { 2842 //printf("insert %p\n",c); 2843 if (!sch.stage(c)) // store c in scheduling table 2844 break; 2845 c = csnip(c); 2846 } 2847 2848 //printf("assem %d\n",sch.tblmax); 2849 pctail = sch.assemble(pctail); // reassemble instruction stream 2850 } 2851 sch.dtor(); 2852 2853 return cresult; 2854 } 2855 2856 /**************************************************************************/ 2857 2858 /******************************************** 2859 * Replace any occurrence of r1 in EA with r2. 
2860 */ 2861 2862 private void repEA(code *c,uint r1,uint r2) 2863 { 2864 uint mod,reg,rm; 2865 uint rmn; 2866 2867 rmn = c.Irm; 2868 mod = rmn & 0xC0; 2869 reg = rmn & modregrm(0,7,0); 2870 rm = rmn & 7; 2871 2872 if (mod == 0xC0 && rm == r1) 2873 { } //c.Irm = mod | reg | r2; 2874 else if (is32bitaddr(I32,c.Iflags) && 2875 // If not disp32 2876 (rmn & modregrm(3,0,7)) != modregrm(0,0,5)) 2877 { 2878 if (rm == 4) 2879 { // SIB byte addressing 2880 uint sib; 2881 uint base; 2882 uint index; 2883 2884 sib = c.Isib; 2885 base = sib & 7; 2886 index = (sib >> 3) & 7; 2887 if (base == r1 && 2888 !(r1 == 5 && mod == 0) && 2889 !(r2 == 5 && mod == 0) 2890 ) 2891 base = r2; 2892 if (index == r1) 2893 index = r2; 2894 c.Isib = cast(ubyte)((sib & 0xC0) | (index << 3) | base); 2895 } 2896 else if (rm == r1) 2897 { 2898 if (r1 == BP && r2 == SP) 2899 { // Replace [EBP] with [ESP] 2900 c.Irm = cast(ubyte)(mod | reg | 4); 2901 c.Isib = modregrm(0,4,SP); 2902 } 2903 else if (r2 == BP && mod == 0) 2904 { 2905 c.Irm = cast(ubyte)(modregrm(1,0,0) | reg | r2); 2906 c.IFL1 = FLconst; 2907 c.IEV1.Vint = 0; 2908 } 2909 else 2910 c.Irm = cast(ubyte)(mod | reg | r2); 2911 } 2912 } 2913 } 2914 2915 /****************************************** 2916 * Instruction scheduler. 2917 * Input: 2918 * c list of instructions to schedule 2919 * scratch scratch registers we can use 2920 * Returns: 2921 * revised list of scheduled instructions 2922 */ 2923 2924 /****************************************** 2925 * Swap c1 and c2. 2926 * c1 comes before c2. 
* Swap in place to not disturb addresses of jmp targets
 *
 * The contents of the two code structs are exchanged, while each node
 * keeps its own `next` link and its own CFtarg/CFtarg2 flags.
 * Params:
 *      c1 = first instruction
 *      c2 = second instruction
 */

private void code_swap(code *c1,code *c2)
{   code cs;

    // Special case of:
    //  PUSH reg1
    //  MOV  reg2,x[ESP]
    // The MOV ends up in front of the PUSH, so its ESP-relative offset
    // is reduced by REGSIZE.
    //printf("code_swap(%x, %x)\n",c1,c2);
    if ((c1.Iop & ~7) == 0x50 &&
        c2.Iop == 0x8B &&
        (c2.Irm & modregrm(3,0,7)) == modregrm(1,0,4) &&
        c2.Isib == modregrm(0,4,SP) &&
        c2.IFL1 == FLconst &&
        (cast(byte)c2.IEV1.Vpointer) >= REGSIZE &&
        (c1.Iop & 7) != ((c2.Irm >> 3) & 7)
       )
        c2.IEV1.Vpointer -= REGSIZE;


    cs = *c2;
    *c2 = *c1;
    *c1 = cs;
    // Retain original CFtarg
    // (at this point c2 holds c1's old flags and cs holds c2's old flags)
    c1.Iflags = (c1.Iflags & ~(CFtarg | CFtarg2)) | (c2.Iflags & (CFtarg | CFtarg2));
    c2.Iflags = (c2.Iflags & ~(CFtarg | CFtarg2)) | (cs.Iflags & (CFtarg | CFtarg2));

    // Restore the original links: the struct copies above swapped them too
    c1.next = c2.next;
    c2.next = cs.next;
}

/******************************************
 * Peephole optimizations on each instruction c and its successor
 * c1 = cnext(c), done in place.
 * Pairs whose second instruction is flagged CFtarg/CFtarg2 are skipped.
 * Params:
 *      cstart = list of instructions
 *      scratch = not referenced by this function
 * Returns:
 *      cstart
 */
private code *peephole(code *cstart,regm_t scratch)
{
    // Look for cases of:
    //  MOV r1,r2
    //  OP ?,r1
    // we can replace with:
    //  MOV r1,r2
    //  OP ?,r2
    // to improve pairing
    code *c1;
    uint r1,r2;
    uint mod,reg,rm;

    //printf("peephole\n");
    for (code *c = cstart; c; c = c1)
    {
        ubyte rmn;

        //c.print();
        c1 = cnext(c);
    Ln:
        if (!c1)
            break;
        if (c1.Iflags & (CFtarg | CFtarg2))
            continue;

        // Do:
        //      PUSH    reg
        // PUSH ESP is excluded: it stores the value ESP had before the
        // push, so afterwards [ESP] does not equal ESP.
        if (I32 && (c.Iop & ~7) == 0x50 && (c.Iop & 7) != SP)
        {
            uint regx = c.Iop & 7;

            //  MOV     [ESP],regx      =>      NOP
            //  MOV     regx,[ESP]      =>      NOP
            // Right after the push, [ESP] and regx hold the same value, so
            // both the store (0x89) and the load (0x8B) are redundant.
            if ((c1.Iop == 0x89 || c1.Iop == 0x8B) &&
                c1.Irm == modregrm(0,regx,4) &&
                c1.Isib == modregrm(0,4,SP))
            {   c1.Iop = NOP;
                continue;
            }

            //  PUSH    [ESP]           =>      PUSH regx
            if (c1.Iop == 0xFF &&
                c1.Irm == modregrm(0,6,4) &&
                c1.Isib == modregrm(0,4,SP))
            {   c1.Iop = 0x50 + regx;
                continue;
            }

            //  CMP     [ESP],imm       =>      CMP regx,imm
            if (c1.Iop == 0x83 &&
                c1.Irm == modregrm(0,7,4) &&
                c1.Isib == modregrm(0,4,SP))
            {   c1.Irm = modregrm(3,7,regx);
                if (c1.IFL2 == FLconst && cast(byte)c1.IEV2.Vuns == 0)
                {   // to TEST regx,regx
                    c1.Iop = (c1.Iop & 1) | 0x84;
                    c1.Irm = modregrm(3,regx,regx);
                }
                continue;
            }

        }

        // Do:
        //      MOV     reg,[ESP]       =>      POP reg
        //      ADD     ESP,4           =>      NOP
        // c1.Irm is compared in full so that only /0 (ADD) matches; the other
        // 0x83 group operations on ESP (OR, ADC, SBB, AND, SUB, XOR, CMP)
        // must not be folded into a POP.
        // MOV ESP,[ESP] is excluded because POP ESP leaves a different
        // value in ESP than the original pair does.
        if (I32 && c.Iop == 0x8B && (c.Irm & 0xC7) == modregrm(0,0,4) &&
            c.Isib == modregrm(0,4,SP) &&
            ((c.Irm >> 3) & 7) != SP &&
            c1.Iop == 0x83 && c1.Irm == modregrm(3,0,SP) &&
            !(c1.Iflags & CFpsw) && c1.IFL2 == FLconst && c1.IEV2.Vint == 4)
        {
            uint regx = (c.Irm >> 3) & 7;
            c.Iop = 0x58 + regx;
            c1.Iop = NOP;
            continue;
        }

        // Combine two ADD/SUB imm8 instructions on the same register
        if (c.Iop == c1.Iop &&
            c.Iop == 0x83 &&
            (c.Irm & 0xC0) == 0xC0 &&
            (c.Irm & modregrm(3,0,7)) == (c1.Irm & modregrm(3,0,7)) &&
            !(c1.Iflags & CFpsw) &&
            c.IFL2 == FLconst && c1.IFL2 == FLconst
           )
        {   int i = cast(byte)c.IEV2.Vint;
            int i1 = cast(byte)c1.IEV2.Vint;
            // Switch on (operation of c << 3) | operation of c1
            switch ((c.Irm & modregrm(0,7,0)) | ((c1.Irm & modregrm(0,7,0)) >> 3))
            {
                case (0 << 3) | 0:              // ADD, ADD
                case (5 << 3) | 5:              // SUB, SUB
                    i += i1;
                    goto Laa;
                case (0 << 3) | 5:              // ADD, SUB
                case (5 << 3) | 0:              // SUB, ADD
                    i -= i1;
                    goto Laa;
                Laa:
                    // Combined value no longer fits in a signed byte:
                    // clear bit 1 of the opcode (0x83 becomes 0x81)
                    if (cast(byte)i != i)
                        c.Iop &= ~2;
                    c.IEV2.Vint = i;
                    c1.Iop = NOP;
                    if (i == 0)
                        c.Iop = NOP;
                    continue;

                default:
                    break;
            }
        }

        if (c.Iop == 0x8B && (c.Irm & 0xC0) == 0xC0)    // MOV r1,r2
        {   r1 = (c.Irm >> 3) & 7;
            r2 = c.Irm & 7;
        }
        else if (c.Iop == 0x89 && (c.Irm & 0xC0) == 0xC0)   // MOV r1,r2
        {   r1 = c.Irm & 7;
            r2 = (c.Irm >> 3) & 7;
        }
        else
        {
            continue;
        }

        // Note: rmn/mod/reg/rm are sampled before repEA() may rewrite c1.Irm
        rmn = c1.Irm;
        mod = rmn & 0xC0;
        reg = rmn & modregrm(0,7,0);
        rm = rmn & 7;
        if (c1.hasModregrm())
            repEA(c1,r1,r2);
        switch (c1.Iop)
        {
            case 0x50:
            case 0x51:
            case 0x52:
            case 0x53:
            case 0x54:
            case 0x55:
            case 0x56:
            case 0x57:                          // PUSH reg
                if ((c1.Iop & 7) == r1)
                {   c1.Iop = 0x50 | r2;
                    //printf("schedule PUSH reg\n");
                }
                break;

            case 0x81:
            case 0x83:
                // Look for CMP EA,imm
                if (reg == modregrm(0,7,0))
                {
                    if (mod == 0xC0 && rm == r1)
                        c1.Irm = cast(ubyte)(mod | reg | r2);
                }
                break;

            case 0x84:                  // TEST reg,byte ptr EA
                if (r1 >= 4 || r2 >= 4) // if not a byte register
                    break;
                // Only the low 2 bits of each register field are compared
                // and replaced; bit 2 is carried over unchanged.
                if ((rmn & 0xC0) == 0xC0)
                {
                    if ((rmn & 3) == r1)
                    {   c1.Irm = rmn = cast(ubyte)((rmn & modregrm(3,7,4)) | r2);
                        //printf("schedule 1\n");
                    }
                }
                if ((rmn & modregrm(0,3,0)) == modregrm(0,r1,0))
                {   c1.Irm = (rmn & modregrm(3,4,7)) | modregrm(0,r2,0);
                    //printf("schedule 2\n");
                }
                break;
            case 0x85:                  // TEST reg,word ptr EA
                if ((rmn & 0xC0) == 0xC0)
                {
                    if ((rmn & 7) == r1)
                    {   c1.Irm = rmn = cast(ubyte)((rmn & modregrm(3,7,0)) | r2);
                        //printf("schedule 3\n");
                    }
                }
                if ((rmn & modregrm(0,7,0)) == modregrm(0,r1,0))
                {   c1.Irm = (rmn & modregrm(3,0,7)) | modregrm(0,r2,0);
                    //printf("schedule 4\n");
                }
                break;

            case 0x89:                  // MOV EA,reg
                if ((rmn & modregrm(0,7,0)) == modregrm(0,r1,0))
                {   c1.Irm = (rmn & modregrm(3,0,7)) | modregrm(0,r2,0);
                    //printf("schedule 5\n");
                    if (c1.Irm == modregrm(3,r2,r2))    // MOV r2,r2
                        goto Lnop;
                }
                break;

            case 0x8B:                  // MOV reg,EA
                if ((rmn & 0xC0) == 0xC0 &&
                    (rmn & 7) == r1)            // if EA == r1
                {   c1.Irm = cast(ubyte)((rmn & modregrm(3,7,0)) | r2);
                    //printf("schedule 6\n");
                    if (c1.Irm == modregrm(3,r2,r2))    // MOV r2,r2
                        goto Lnop;
                }
                break;

            case 0x3C:                  // CMP AL,imm8
                if (r1 == AX && r2 < 4)
                {   c1.Iop = 0x80;
                    c1.Irm = modregrm(3,7,r2);
                    //printf("schedule 7, r2 = %d\n", r2);
                }
                break;

            case 0x3D:                  // CMP AX,imm16
                if (r1 == AX)
                {   c1.Iop = 0x81;
                    c1.Irm = modregrm(3,7,r2);
                    // Use the 0x83 form when the constant survives a
                    // round trip through a signed byte
                    if (c1.IFL2 == FLconst &&
                        c1.IEV2.Vuns == cast(byte)c1.IEV2.Vuns)
                        c1.Iop = 0x83;
                    //printf("schedule 8\n");
                }
                break;

            default:
                break;
        }
        continue;
Lnop:
        // c1 became a NOP; retry with the same c and the instruction after c1
        c1.Iop = NOP;
        c1 = cnext(c1);
        goto Ln;
    }
    return cstart;
}

/*****************************************************************/

/**********************************************
 * Replace complex instructions with simple ones more conducive
 * to scheduling.
 *
 * CMP mem,imm (opcode 0x83) becomes MOV reg,mem followed by CMP reg,imm
 * (or TEST reg,reg when the immediate is 0), and PUSH mem becomes
 * MOV reg,mem followed by PUSH reg.
 * Instructions flagged CFtarg, CFtarg2 or CFopsize are left alone.
 * Params:
 *      c = list of instructions
 *      scratch = mask of scratch registers; registers in fregsaved are
 *                removed from it, and nothing is done unless at least
 *                two registers remain
 * Returns:
 *      the head of the list, which is the same node as c
 */

@trusted
code *simpleops(code *c,regm_t scratch)
{   code *cstart;
    uint reg;
    code *c2;

    // Worry about using registers not saved yet by prolog
    scratch &= ~fregsaved;

    if (!(scratch & (scratch - 1)))     // if 0 or 1 registers
        return c;

    reg = findreg(scratch);

    cstart = c;
    for (code** pc = &cstart; *pc; pc = &(*pc).next)
    {
        c = *pc;
        if (c.Iflags & (CFtarg | CFtarg2 | CFopsize))
            continue;
        if (c.Iop == 0x83 &&
            (c.Irm & modregrm(0,7,0)) == modregrm(0,7,0) &&
            (c.Irm & modregrm(3,0,0)) != modregrm(3,0,0)
           )
        {   // Replace CMP mem,imm with:
            //  MOV reg,mem
            //  CMP reg,imm
            targ_long imm;

            //printf("replacing CMP\n");
            c.Iop = 0x8B;
            c.Irm = (c.Irm & modregrm(3,0,7)) | modregrm(0,reg,0);

            c2 = code_calloc();
            if (reg == AX)
                c2.Iop = 0x3D;
            else
            {   c2.Iop = 0x83;
                c2.Irm = modregrm(3,7,reg);
            }
            c2.IFL2 = c.IFL2;
            c2.IEV2 = c.IEV2;

            // See if c2 should be replaced by a TEST
            imm = c2.IEV2.Vuns;
            if (!(c2.Iop & 1))
                imm &= 0xFF;
            else if (I32 ? c.Iflags & CFopsize : !(c.Iflags & CFopsize))
                imm = cast(short) imm;
            if (imm == 0)
            {
                c2.Iop = 0x85;                  // TEST reg,reg
                c2.Irm = modregrm(3,reg,reg);
            }
            goto L1;
        }
        else if (c.Iop == 0xFF &&
            (c.Irm & modregrm(0,7,0)) == modregrm(0,6,0) &&
            (c.Irm & modregrm(3,0,0)) != modregrm(3,0,0)
           )
        {   // Replace PUSH mem with:
            //  MOV reg,mem
            //  PUSH reg

            // printf("replacing PUSH\n");
            c.Iop = 0x8B;
            c.Irm = (c.Irm & modregrm(3,0,7)) | modregrm(0,reg,0);

            c2 = gen1(null,0x50 + reg);
        L1:
            //c.print();
            //c2.print();
            // Link the new instruction c2 in right after c
            c2.next = c.next;
            c.next = c2;

            // Switch to another reg
            if (scratch & ~mask(reg))
                reg = findreg(scratch & ~mask(reg));
        }
    }
    return cstart;
}