-
Notifications
You must be signed in to change notification settings - Fork 9
/
Copy pathooocore.h
2033 lines (1766 loc) · 58 KB
/
ooocore.h
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
// -*- c++ -*-
//
// PTLsim: Cycle Accurate x86-64 Simulator
// Out-of-Order Core Simulator
//
// Copyright 2003-2008 Matt T. Yourst <yourst@yourst.com>
// Copyright 2006-2008 Hui Zeng <hzeng@cs.binghamton.edu>
//
#ifndef _OOOCORE_H_
#define _OOOCORE_H_
// With these disabled, simulation is faster
#define ENABLE_CHECKS
#define ENABLE_LOGGING
//
// Enable SMT operation:
//
// Note that this limits some configurations of resources and
// issue queues that would normally be possible in single
// threaded mode.
//
#ifdef PTLSIM_HYPERVISOR
#define ENABLE_SMT
#endif
static const int MAX_THREADS_BIT = 4; // up to 16 threads
static const int MAX_ROB_IDX_BIT = 12; // up to 4096 ROB entries
#ifdef ENABLE_SMT
static const int MAX_THREADS_PER_CORE = 2;
#else
static const int MAX_THREADS_PER_CORE = 1;
#endif
#define per_context_ooocore_stats_ref(vcpuid) (*(((PerContextOutOfOrderCoreStats*)&stats.ooocore.vcpu0) + (vcpuid)))
#define per_context_ooocore_stats_update(vcpuid, expr) stats.ooocore.total.expr, per_context_ooocore_stats_ref(vcpuid).expr
namespace OutOfOrderModel {
//
// Operand formats
//
static const int MAX_OPERANDS = 4;
static const int RA = 0;
static const int RB = 1;
static const int RC = 2;
static const int RS = 3; // (for stores only)
//
// Uop to functional unit mappings
//
static const int FU_COUNT = 8;
static const int LOADLAT = 2;
enum {
FU_LDU0 = (1 << 0),
FU_STU0 = (1 << 1),
FU_LDU1 = (1 << 2),
FU_STU1 = (1 << 3),
FU_ALU0 = (1 << 4),
FU_FPU0 = (1 << 5),
FU_ALU1 = (1 << 6),
FU_FPU1 = (1 << 7),
};
static const int LOAD_FU_COUNT = 2;
// Human-readable names of the functional units, indexed by bit position in
// the FU_* mask above (FU_LDU0 = bit 0 -> "ldu0", ... FU_FPU1 = bit 7 -> "fpu1").
// The order MUST stay in sync with the FU_* enum.
// NOTE(review): this is a non-static array *definition* in a header; if this
// header is ever included by more than one translation unit it will produce
// duplicate-symbol link errors - confirm single inclusion, or make it static.
const char* fu_names[FU_COUNT] = {
"ldu0",
"stu0",
"ldu1",
"stu1",
"alu0",
"fpu0",
"alu1",
"fpu1",
};
//
// Opcodes and properties
//
#define ALU0 FU_ALU0
#define ALU1 FU_ALU1
#define STU0 FU_STU0
#define STU1 FU_STU1
#define LDU0 FU_LDU0
#define LDU1 FU_LDU1
#define FPU0 FU_FPU0
#define FPU1 FU_FPU1
#define A 1 // ALU latency, assuming fast bypass
#define L LOADLAT
#define ANYALU ALU0|ALU1
#define ANYLDU LDU0|LDU1
#define ANYSTU STU0|STU1
#define ANYFPU FPU0|FPU1
#define ANYINT ANYALU|ANYSTU|ANYLDU
// Static issue properties of one uop opcode: which functional units can
// execute it and how long it takes assuming an ideal bypass network.
// One entry per opcode in the fuinfo[] table below.
struct FunctionalUnitInfo {
byte opcode; // Must match definition in ptlhwdef.h and ptlhwdef.cpp!
byte latency; // Latency in cycles, assuming ideal bypass
W16 fu; // Map of functional units on which this uop can issue
};
//
// WARNING: This table MUST be kept in sync with the table
// in ptlhwdef.cpp and the uop enum in ptlhwdef.h!
//
// Per-opcode issue table, indexed by opcode number (so row order MUST match
// the uop enum in ptlhwdef.h). Latencies use the A (ALU, fast bypass) and
// L (LOADLAT) macros defined above; the third column is the mask of
// functional units the uop may issue on. Note the deliberately restricted
// rows: integer divide/remainder are ALU0-only and memory fences (OP_mf)
// issue only on STU0.
const FunctionalUnitInfo fuinfo[OP_MAX_OPCODE] = {
// name, latency, fumask
{OP_nop, A, ANYINT|ANYFPU},
{OP_mov, A, ANYINT|ANYFPU},
// Logical
{OP_and, A, ANYINT|ANYFPU},
{OP_andnot, A, ANYINT|ANYFPU},
{OP_xor, A, ANYINT|ANYFPU},
{OP_or, A, ANYINT|ANYFPU},
{OP_nand, A, ANYINT|ANYFPU},
{OP_ornot, A, ANYINT|ANYFPU},
{OP_eqv, A, ANYINT|ANYFPU},
{OP_nor, A, ANYINT|ANYFPU},
// Mask, insert or extract bytes
{OP_maskb, A, ANYINT},
// Add and subtract
{OP_add, A, ANYINT},
{OP_sub, A, ANYINT},
{OP_adda, A, ANYINT},
{OP_suba, A, ANYINT},
{OP_addm, A, ANYINT},
{OP_subm, A, ANYINT},
// Condition code logical ops
{OP_andcc, A, ANYINT},
{OP_orcc, A, ANYINT},
{OP_xorcc, A, ANYINT},
{OP_ornotcc, A, ANYINT},
// Condition code movement and merging
{OP_movccr, A, ANYINT},
{OP_movrcc, A, ANYINT},
{OP_collcc, A, ANYINT},
// Simple shifting (restricted to small immediate 1..8)
{OP_shls, A, ANYINT},
{OP_shrs, A, ANYINT},
{OP_bswap, A, ANYINT},
{OP_sars, A, ANYINT},
// Bit testing
{OP_bt, A, ANYALU},
{OP_bts, A, ANYALU},
{OP_btr, A, ANYALU},
{OP_btc, A, ANYALU},
// Set and select
{OP_set, A, ANYINT},
{OP_set_sub, A, ANYINT},
{OP_set_and, A, ANYINT},
{OP_sel, A, ANYINT},
{OP_sel_cmp, A, ANYINT},
// Branches
{OP_br, A, ANYINT},
{OP_br_sub, A, ANYINT},
{OP_br_and, A, ANYINT},
{OP_jmp, A, ANYINT},
{OP_bru, A, ANYINT},
{OP_jmpp, A, ANYALU|ANYLDU},
{OP_brp, A, ANYALU|ANYLDU},
// Checks
{OP_chk, A, ANYINT},
{OP_chk_sub, A, ANYINT},
{OP_chk_and, A, ANYINT},
// Loads and stores
{OP_ld, L, ANYLDU},
{OP_ldx, L, ANYLDU},
{OP_ld_pre, 1, ANYLDU},
{OP_ld_a16, L, ANYLDU},
{OP_st, 1, ANYSTU},
{OP_st_a16, 1, ANYSTU},
{OP_mf, 1, STU0 },
// Shifts, rotates and complex masking
{OP_shl, A, ANYALU},
{OP_shr, A, ANYALU},
{OP_mask, A, ANYALU},
{OP_sar, A, ANYALU},
{OP_rotl, A, ANYALU},
{OP_rotr, A, ANYALU},
{OP_rotcl, A, ANYALU},
{OP_rotcr, A, ANYALU},
// Multiplication
{OP_mull, 4, ANYFPU},
{OP_mulh, 4, ANYFPU},
{OP_mulhu, 4, ANYFPU},
{OP_mulhl, 4, ANYFPU},
// Bit scans
{OP_ctz, 3, ANYFPU},
{OP_clz, 3, ANYFPU},
{OP_ctpop, 3, ANYFPU},
{OP_permb, 4, ANYFPU},
// Integer divide and remainder step
{OP_div, 32, ALU0},
{OP_rem, 32, ALU0},
{OP_divs, 32, ALU0},
{OP_rems, 32, ALU0},
// Minimum and maximum
{OP_min, A, ANYALU},
{OP_max, A, ANYALU},
{OP_min_s, A, ANYALU},
{OP_max_s, A, ANYALU},
// Floating point
// uop.size bits have following meaning:
// 00 = single precision, scalar (preserve high 32 bits of ra)
// 01 = single precision, packed (two 32-bit floats)
// 1x = double precision, scalar or packed (use two uops to process 128-bit xmm)
{OP_fadd, 6, ANYFPU},
{OP_fsub, 6, ANYFPU},
{OP_fmul, 6, ANYFPU},
{OP_fmadd, 6, ANYFPU},
{OP_fmsub, 6, ANYFPU},
{OP_fmsubr, 6, ANYFPU},
{OP_fdiv, 6, ANYFPU},
{OP_fsqrt, 6, ANYFPU},
{OP_frcp, 6, ANYFPU},
{OP_frsqrt, 6, ANYFPU},
{OP_fmin, 6, ANYFPU},
{OP_fmax, 6, ANYFPU},
{OP_fcmp, 6, ANYFPU},
// For fcmpcc, uop.size bits have following meaning:
// 00 = single precision ordered compare
// 01 = single precision unordered compare
// 10 = double precision ordered compare
// 11 = double precision unordered compare
{OP_fcmpcc, 4, ANYFPU},
// and/andn/or/xor are done using integer uops
// For these conversions, uop.size bits select truncation mode:
// x0 = normal IEEE-style rounding
// x1 = truncate to zero
{OP_fcvt_i2s_ins, 6, ANYFPU},
{OP_fcvt_i2s_p, 6, ANYFPU},
{OP_fcvt_i2d_lo, 6, ANYFPU},
{OP_fcvt_i2d_hi, 6, ANYFPU},
{OP_fcvt_q2s_ins, 6, ANYFPU},
{OP_fcvt_q2d, 6, ANYFPU},
{OP_fcvt_s2i, 6, ANYFPU},
{OP_fcvt_s2q, 6, ANYFPU},
{OP_fcvt_s2i_p, 6, ANYFPU},
{OP_fcvt_d2i, 6, ANYFPU},
{OP_fcvt_d2q, 6, ANYFPU},
{OP_fcvt_d2i_p, 6, ANYFPU},
{OP_fcvt_d2s_ins, 6, ANYFPU},
{OP_fcvt_d2s_p, 6, ANYFPU},
{OP_fcvt_s2d_lo, 6, ANYFPU},
{OP_fcvt_s2d_hi, 6, ANYFPU},
// Vector integer uops
// uop.size defines element size: 00 = byte, 01 = W16, 10 = W32, 11 = W64 (i.e. same as normal ALU uops)
{OP_vadd, 1, ANYFPU},
{OP_vsub, 1, ANYFPU},
{OP_vadd_us, 1, ANYFPU},
{OP_vsub_us, 1, ANYFPU},
{OP_vadd_ss, 1, ANYFPU},
{OP_vsub_ss, 1, ANYFPU},
{OP_vshl, 1, ANYFPU},
{OP_vshr, 1, ANYFPU},
{OP_vbt, 1, ANYFPU},
{OP_vsar, 1, ANYFPU},
{OP_vavg, 1, ANYFPU},
{OP_vcmp, 1, ANYFPU},
{OP_vmin, 1, ANYFPU},
{OP_vmax, 1, ANYFPU},
{OP_vmin_s, 1, ANYFPU},
{OP_vmax_s, 1, ANYFPU},
{OP_vmull, 4, ANYFPU},
{OP_vmulh, 4, ANYFPU},
{OP_vmulhu, 4, ANYFPU},
{OP_vmaddp, 4, ANYFPU},
{OP_vsad, 4, ANYFPU},
{OP_vpack_us, 2, ANYFPU},
{OP_vpack_ss, 2, ANYFPU},
};
#undef A
#undef L
#undef F
#undef ALU0
#undef ALU1
#undef STU0
#undef STU1
#undef LDU0
#undef LDU1
#undef FPU0
#undef FPU1
#undef L
#undef ANYALU
#undef ANYLDU
#undef ANYSTU
#undef ANYFPU
#undef ANYINT
//
// Global limits
//
const int MAX_ISSUE_WIDTH = 4;
// Largest size of any physical register file or the store queue:
const int MAX_PHYS_REG_FILE_SIZE = 256;
const int PHYS_REG_FILE_SIZE = 256;
const int PHYS_REG_NULL = 0;
//
// IMPORTANT! If you change this to be greater than 256, you MUST
// #define BIG_ROB below to use the correct associative search logic
// (16-bit tags vs 8-bit tags).
//
// SMT always has BIG_ROB enabled: high 4 bits are used for thread id
//
#define BIG_ROB
const int ROB_SIZE = 128;
// Maximum number of branches in the pipeline at any given time
const int MAX_BRANCHES_IN_FLIGHT = 16;
// Set this to combine the integer and FP phys reg files:
// #define UNIFIED_INT_FP_PHYS_REG_FILE
#ifdef UNIFIED_INT_FP_PHYS_REG_FILE
// unified, br, st
const int PHYS_REG_FILE_COUNT = 3;
#else
// int, fp, br, st
const int PHYS_REG_FILE_COUNT = 4;
#endif
//
// Load and Store Queues
//
const int LDQ_SIZE = 48;
const int STQ_SIZE = 32;
//
// Fetch
//
const int FETCH_QUEUE_SIZE = 32;
const int FETCH_WIDTH = 4;
//
// Frontend (Rename and Decode)
//
const int FRONTEND_WIDTH = 4;
const int FRONTEND_STAGES = 5;
//
// Dispatch
//
const int DISPATCH_WIDTH = 4;
//
// Writeback
//
const int WRITEBACK_WIDTH = 4;
//
// Commit
//
const int COMMIT_WIDTH = 4;
//
// Clustering, Issue Queues and Bypass Network
//
const int MAX_FORWARDING_LATENCY = 2;
#define MULTI_IQ
#ifdef ENABLE_SMT
//
// Multiple issue queues are currently only supported in
// the non-SMT configuration, due to ambiguities in the
// ICOUNT SMT heuristic when multiple queues are active.
//
#undef MULTI_IQ
#endif
#ifdef MULTI_IQ
const int MAX_CLUSTERS = 4;
#else
const int MAX_CLUSTERS = 1;
#endif
enum { PHYSREG_NONE, PHYSREG_FREE, PHYSREG_WAITING, PHYSREG_BYPASS, PHYSREG_WRITTEN, PHYSREG_ARCH, PHYSREG_PENDINGFREE, MAX_PHYSREG_STATE };
static const char* physreg_state_names[MAX_PHYSREG_STATE] = {"none", "free", "waiting", "bypass", "written", "arch", "pendingfree"};
static const char* short_physreg_state_names[MAX_PHYSREG_STATE] = {"-", "free", "wait", "byps", "wrtn", "arch", "pend"};
#ifdef INSIDE_OOOCORE
struct OutOfOrderCore;
OutOfOrderCore& coreof(int coreid);
struct ReorderBufferEntry;
//
// Issue queue based scheduler with broadcast
//
#ifdef BIG_ROB
typedef W16 issueq_tag_t;
#else
typedef byte issueq_tag_t;
#endif
//
// Broadcast-wakeup issue queue: each slot holds the uop's id tag plus one
// associative wakeup tag per operand. When a producing uop completes, its
// tag is broadcast against all operand tag arrays to mark consumers ready.
//
// Slot state encoding (valid/issued bitvecs):
//              V I
//   free       0 0
//   dispatched 1 0
//   issued     1 1
//   complete   0 1
//
template <int size, int operandcount = MAX_OPERANDS>
struct IssueQueue {
#ifdef BIG_ROB
  typedef FullyAssociativeTags16bit<size, size> assoc_t;
  typedef vec8w vec_t;
#else
  typedef FullyAssociativeTags8bit<size, size> assoc_t;
  typedef vec16b vec_t;
#endif
  typedef issueq_tag_t tag_t;

  static const int SIZE = size;

  assoc_t uopids;              // uop id tag for each occupied slot
  assoc_t tags[operandcount];  // wakeup tag per operand, per slot

  bitvec<size> valid;
  bitvec<size> issued;
  bitvec<size> allready;       // slots whose operands are all ready

  int count;                   // number of occupied slots
  byte coreid;
  int shared_entries;          // entries available to any thread
  int reserved_entries;        // per-thread reserved entries (SMT)

  void set_reserved_entries(int num) { reserved_entries = num; }

  // Restore the shared pool to its full size (all non-reserved entries).
  bool reset_shared_entries() {
    shared_entries = size - reserved_entries;
    return true;
  }

  // NOTE(review): despite the name, this decrements the *shared* pool;
  // confirm against callers that this accounting is intended.
  bool alloc_reserved_entry() {
    assert(shared_entries > 0);
    shared_entries--;
    return true;
  }

  bool free_shared_entry() {
    assert(shared_entries < size - reserved_entries);
    shared_entries++;
    return true;
  }

  bool shared_empty() {
    return (shared_entries == 0);
  }

  // Number of free slots. FIX: this was declared bool, which collapsed the
  // count to 0/1 for any caller using the value numerically; full() below
  // (which only needs the truth value) is unaffected by the widening.
  int remaining() const { return (size - count); }
  bool empty() const { return (!count); }
  bool full() const { return (!remaining()); }

  int uopof(int slot) const {
    return uopids[slot];
  }

  // Associative lookup: slot currently holding uopid, or negative if absent.
  int slotof(int uopid) const {
    return uopids.search(uopid);
  }

  void reset(int coreid);
  void reset(int coreid, int threadid);
  void clock();
  bool insert(tag_t uopid, const tag_t* operands, const tag_t* preready);
  bool broadcast(tag_t uopid);
  int issue();
  bool replay(int slot, const tag_t* operands, const tag_t* preready);
  bool switch_to_end(int slot, const tag_t* operands, const tag_t* preready);
  bool remove(int slot);
  ostream& print(ostream& os) const;
  void tally_broadcast_matches(tag_t sourceid, const bitvec<size>& mask, int operand) const;

  //
  // Replay a uop that has already issued once.
  // The caller may add or reset dependencies here as needed.
  //
  bool replay(int slot) {
    issued[slot] = 0;
    return true;
  }

  //
  // Remove an entry from the issue queue after it has completed,
  // or in the process of annulment.
  //
  bool release(int slot) {
    remove(slot);
    return true;
  }

  bool annul(int slot) {
    remove(slot);
    return true;
  }

  // Annul by uop id; returns false if the uop is no longer in the queue.
  bool annuluop(int uopid) {
    int slot = slotof(uopid);
    if (slot < 0) return false;
    remove(slot);
    return true;
  }

  OutOfOrderCore& getcore() const { return coreof(coreid); }
};
// Stream-insertion convenience wrapper: delegates to IssueQueue::print().
template <int size, int operandcount>
static inline ostream& operator <<(ostream& os, const IssueQueue<size, operandcount>& issueq) {
return issueq.print(os);
}
//
// Iterate through a linked list of objects where each object directly inherits
// only from the selfqueuelink class or otherwise has a selfqueuelink object
// as the first member.
//
// This iterator supports mutable lists, meaning the current entry (obj) may
// be safely removed from the list and/or moved to some other list without
// affecting the next object processed.
//
// This does NOT mean you can remove any object from the list other than the
// current object obj - to do this, copy the list of pointers to an array and
// then process that instead.
//
#define foreach_list_mutable_linktype(L, obj, entry, nextentry, linktype) \
linktype* entry; \
linktype* nextentry; \
for (entry = (L).next, nextentry = entry->next, prefetch(entry->next), obj = (typeof(obj))entry; \
entry != &(L); entry = nextentry, nextentry = entry->next, prefetch(nextentry), obj = (typeof(obj))entry)
#define foreach_list_mutable(L, obj, entry, nextentry) foreach_list_mutable_linktype(L, obj, entry, nextentry, selfqueuelink)
struct StateList;
// Fixed-capacity registry of up to 64 StateList pointers; `count` tracks
// how many have been registered via add().
struct ListOfStateLists: public array<StateList*, 64> {
  int count; // number of lists registered so far

  ListOfStateLists(): count(0) { }

  int add(StateList* list);
  void reset();
};
struct StateList: public selfqueuelink {
char* name;
int count;
int listid;
W64 dispatch_source_counter;
W64 issue_source_counter;
W32 flags;
StateList() { count = 0; listid = 0; }
void init(const char* name, ListOfStateLists& lol, W32 flags = 0);
StateList(const char* name, ListOfStateLists& lol, W32 flags = 0) {
init(name, lol, flags);
}
// simulated asymmetric c++ array constructor:
StateList& operator ()(const char* name, ListOfStateLists& lol, W32 flags = 0) {
init(name, lol, flags);
return *this;
}
void reset();
selfqueuelink* dequeue() {
if (empty())
return null;
count--;
assert(count >=0);
selfqueuelink* obj = removehead();
return obj;
}
selfqueuelink* enqueue(selfqueuelink* entry) {
entry->addtail(this);
count++;
return entry;
}
selfqueuelink* enqueue_after(selfqueuelink* entry, selfqueuelink* preventry) {
if (preventry) entry->addhead(preventry); else entry->addhead(this);
count++;
return entry;
}
selfqueuelink* remove(selfqueuelink* entry) {
assert(entry->linked());
entry->unlink();
count--;
assert(count >=0);
return entry;
}
selfqueuelink* peek() {
return (empty()) ? null : head();
}
void checkvalid();
};
template <typename T>
static void print_list_of_state_lists(ostream& os, const ListOfStateLists& lol, const char* title);
//
// Fetch Buffers
//
// Extends the base PredictorUpdate with the extra state needed to recover
// a branch after misprediction.
struct BranchPredictorUpdateInfo: public PredictorUpdate {
int stack_recover_idx; // return-address-stack recovery point
int bptype; // branch predictor type/class for this branch
W64 ripafter; // RIP of the instruction following the branch
};
// One fetched uop plus the fetch-time metadata (RIP, uuid, synthesized
// implementation function and branch predictor state) carried with it
// through the pipeline.
struct FetchBufferEntry: public TransOp {
RIPVirtPhys rip;
W64 uuid;
uopimpl_func_t synthop;
BranchPredictorUpdateInfo predinfo;
W16 index;
W8 threadid;
byte ld_st_truly_unaligned;
int init(int index) { this->index = index; return 0; }
void validate() { }
FetchBufferEntry() { }
// Copies ONLY the TransOp base subobject; the FetchBufferEntry-specific
// fields above are deliberately left for the caller to fill in.
FetchBufferEntry(const TransOp& transop) {
*((TransOp*)this) = transop;
}
};
//
// ReorderBufferEntry
struct ThreadContext;
struct OutOfOrderCore;
struct PhysicalRegister;
struct LoadStoreQueueEntry;
struct OutOfOrderCoreEvent;
//
// Reorder Buffer (ROB) structure, used for tracking all uops in flight.
// This same structure is used to represent both dispatched but not yet issued
// uops as well as issued uops.
//
struct ReorderBufferEntry: public selfqueuelink {
FetchBufferEntry uop; // the uop this entry tracks, with fetch metadata
struct StateList* current_state_list; // which pipeline state list this entry is on
PhysicalRegister* physreg; // destination physical register
PhysicalRegister* operands[MAX_OPERANDS]; // source physical registers (RA/RB/RC/RS)
LoadStoreQueueEntry* lsq; // LSQ entry, for loads/stores only
W16s idx; // index of this entry within the ROB
W16s cycles_left; // execution latency counter, decremented every cycle when executing
W16s forward_cycle; // forwarding cycle after completion
W16s lfrqslot; // load fill request queue slot, if any
W16s iqslot; // issue queue slot, if any
W16 executable_on_cluster_mask;
W8s cluster; // cluster selected for issue (W8s: negative = none yet)
W8 coreid;
W8 threadid;
byte fu; // functional unit the uop issued on
byte consumer_count;
PTEUpdate pteupdate;
Waddr origvirt; // original virtual address, with low bits
Waddr virtpage; // virtual page number actually accessed by the load or store
byte entry_valid:1, load_store_second_phase:1, all_consumers_off_bypass:1, dest_renamed_before_writeback:1, no_branches_between_renamings:1, transient:1, lock_acquired:1, issued:1;
byte tlb_walk_level;
int index() const { return idx; }
void validate() { entry_valid = true; }
// Move this entry to newqueue, unlinking it from its current state list
// first. With place_at_head set, it is inserted right after prevrob
// (or at the head when prevrob is null) instead of at the tail.
void changestate(StateList& newqueue, bool place_at_head = false, ReorderBufferEntry* prevrob = null) {
if (current_state_list)
current_state_list->remove(this);
current_state_list = &newqueue;
if (place_at_head) newqueue.enqueue_after(this, prevrob); else newqueue.enqueue(this);
}
void init(int idx);
void reset();
bool ready_to_issue() const;
bool ready_to_commit() const;
StateList& get_ready_to_issue_list() const;
bool find_sources();
int forward();
int select_cluster();
int issue();
Waddr addrgen(LoadStoreQueueEntry& state, Waddr& origaddr, Waddr& virtpage, W64 ra, W64 rb, W64 rc, PTEUpdate& pteupdate, Waddr& addr, int& exception, PageFaultErrorCode& pfec, bool& annul);
bool handle_common_load_store_exceptions(LoadStoreQueueEntry& state, Waddr& origaddr, Waddr& addr, int& exception, PageFaultErrorCode& pfec);
int issuestore(LoadStoreQueueEntry& state, Waddr& origvirt, W64 ra, W64 rb, W64 rc, bool rcready, PTEUpdate& pteupdate);
int issueload(LoadStoreQueueEntry& state, Waddr& origvirt, W64 ra, W64 rb, W64 rc, PTEUpdate& pteupdate);
void issueprefetch(IssueState& state, W64 ra, W64 rb, W64 rc, int cachelevel);
int probecache(Waddr addr, LoadStoreQueueEntry* sfra);
void tlbwalk();
int issuefence(LoadStoreQueueEntry& state);
void release();
W64 annul(bool keep_misspec_uop, bool return_first_annulled_rip = false);
W64 annul_after() { return annul(true); }
W64 annul_after_and_including() { return annul(false); }
int commit();
void replay();
void replay_locked();
int pseudocommit();
void redispatch(const bitvec<MAX_OPERANDS>& dependent_operands, ReorderBufferEntry* prevrob);
void redispatch_dependents(bool inclusive = true);
void loadwakeup();
void fencewakeup();
LoadStoreQueueEntry* find_nearest_memory_fence();
bool release_mem_lock(bool forced = false);
ostream& print(ostream& os) const;
stringbuf& get_operand_info(stringbuf& sb, int operand) const;
ostream& print_operand_info(ostream& os, int operand) const;
OutOfOrderCore& getcore() const { return coreof(coreid); }
ThreadContext& getthread() const;
issueq_tag_t get_tag();
};
//
// Split an issue queue tag into its thread id (upper MAX_THREADS_BIT bits)
// and ROB index (lower MAX_ROB_IDX_BIT bits).
//
// FIX: declared static inline. This function is *defined* in a header, so
// without internal linkage every translation unit including it would emit
// a duplicate external symbol. The mask is also simplified: the original
// ((1 << (MAX_ROB_IDX_BIT + MAX_THREADS_BIT)) - 1) >> MAX_THREADS_BIT
// is exactly (1 << MAX_ROB_IDX_BIT) - 1.
//
static inline void decode_tag(issueq_tag_t tag, int& threadid, int& idx) {
  threadid = tag >> MAX_ROB_IDX_BIT;
  idx = tag & ((1 << MAX_ROB_IDX_BIT) - 1);
}
// Stream-insertion convenience wrapper: delegates to ReorderBufferEntry::print().
static inline ostream& operator <<(ostream& os, const ReorderBufferEntry& rob) {
return rob.print(os);
}
//
// Load/Store Queue
//
#define LSQ_SIZE (LDQ_SIZE + STQ_SIZE)
// Define this to allow speculative issue of loads before unresolved stores
#define SMT_ENABLE_LOAD_HOISTING
struct LoadStoreQueueEntry: public SFR {
ReorderBufferEntry* rob;
W16 idx;
byte coreid;
W8s mbtag;
W8 store:1, lfence:1, sfence:1, entry_valid:1;
W32 padding;
LoadStoreQueueEntry() { }
int index() const { return idx; }
void reset() {
int oldidx = idx;
setzero(*this);
idx = oldidx;
mbtag = -1;
}
void init(int idx) {
this->idx = idx;
reset();
}
void validate() { entry_valid = 1; }
ostream& print(ostream& os) const;
LoadStoreQueueEntry& operator =(const SFR& sfr) {
*((SFR*)this) = sfr;
return *this;
}
OutOfOrderCore& getcore() const { return coreof(coreid); }
};
// Stream-insertion convenience wrapper: delegates to LoadStoreQueueEntry::print().
static inline ostream& operator <<(ostream& os, const LoadStoreQueueEntry& lsq) {
return lsq.print(os);
}
// Compact snapshot of one operand's rename information, filled in by
// PhysicalRegister::fill_operand_info() for logging/debug output.
struct PhysicalRegisterOperandInfo {
W32 uuid;
W16 physreg; // physical register index
W16 rob; // producing ROB entry index
byte state; // PHYSREG_* state at snapshot time
byte rfid; // register file id
byte archreg; // architectural register this operand renames
byte pad1;
};
ostream& operator <<(ostream& os, const PhysicalRegisterOperandInfo& opinfo);
//
// Physical Register File
//
// One physical register: its data/flags, its PHYSREG_* lifecycle state
// (each state corresponds to a StateList it is linked on), and a reference
// count tracking speculative and committed consumers.
struct PhysicalRegister: public selfqueuelink {
ReorderBufferEntry* rob; // producing ROB entry (null when free)
W64 data;
W16 flags; // FLAG_INV / FLAG_WAIT etc.; see valid()/ready() below
W16 idx; // index within its register file
W8 coreid;
W8 rfid; // which physical register file this belongs to
W8 state; // current PHYSREG_* state
W8 archreg; // architectural register currently mapped here
W8 all_consumers_sourced_from_bypass:1;
W16s refcount;
W8 threadid;
StateList& get_state_list(int state) const;
StateList& get_state_list() const { return get_state_list(this->state); }
// Move between per-state lists; PHYSREG_NONE means not yet on any list.
void changestate(int newstate) {
if likely (state != PHYSREG_NONE) get_state_list(state).remove(this);
state = newstate;
get_state_list(state).enqueue(this);
}
void init(int coreid, int rfid, int idx) {
this->coreid = coreid;
this->rfid = rfid;
this->idx = idx;
reset();
}
private:
void addref() { refcount++; }
// Register 0 is the always-valid null register and may go negative;
// any other register underflowing its refcount is a bug.
void unref() {
refcount--;
assert((idx == 0) || (refcount >= 0));
}
public:
// The public ref/unref variants take the referencing context purely for
// bookkeeping symmetry; all of them delegate to the private counters.
void addref(const ReorderBufferEntry& rob, W8 threadid) { addref(); }
void unref(const ReorderBufferEntry& rob, W8 threadid) { unref(); }
void addspecref(int archreg, W8 threadid) { addref(); }
void unspecref(int archreg, W8 threadid) { unref(); }
void addcommitref(int archreg, W8 threadid) { addref(); }
void uncommitref(int archreg, W8 threadid) { unref(); }
bool referenced() const { return (refcount > 0); }
bool nonnull() const { return (index() != PHYS_REG_NULL); }
bool allocated() const { return (state != PHYSREG_FREE); }
// Lifecycle transitions: complete -> writeback -> commit, or free.
void commit() { changestate(PHYSREG_ARCH); }
void complete() { changestate(PHYSREG_BYPASS); }
void writeback() { changestate(PHYSREG_WRITTEN); }
void free() {
changestate(PHYSREG_FREE);
rob = 0;
refcount = 0;
threadid = 0xff;
all_consumers_sourced_from_bypass = 1;
}
private:
void reset() {
selfqueuelink::reset();
state = PHYSREG_NONE;
free();
}
public:
// Per-thread reset: only registers owned by threadid are freed unless
// check_id is false, in which case the register is fully reset.
void reset(W8 threadid, bool check_id = true) {
if (check_id && this->threadid != threadid) return;
if (!check_id) {
selfqueuelink::reset();
state = PHYSREG_NONE;
}
free();
}
int index() const { return idx; }
bool valid() const { return ((flags & FLAG_INV) == 0); }
bool ready() const { return ((flags & FLAG_WAIT) == 0); }
void fill_operand_info(PhysicalRegisterOperandInfo& opinfo);
OutOfOrderCore& getcore() const { return coreof(coreid); }
};
ostream& operator <<(ostream& os, const PhysicalRegister& physreg);
// A physical register file: a fixed-capacity array of PhysicalRegisters
// (only the first `size` are used) plus one StateList per PHYSREG_* state
// and allocation statistics.
struct PhysicalRegisterFile: public array<PhysicalRegister, MAX_PHYS_REG_FILE_SIZE> {
byte coreid;
byte rfid;
W16 size; // number of registers actually in use (<= MAX_PHYS_REG_FILE_SIZE)
const char* name;
StateList states[MAX_PHYSREG_STATE];
W64 allocations;
W64 frees;
PhysicalRegisterFile() { }
PhysicalRegisterFile(const char* name, int coreid, int rfid, int size) {
init(name, coreid, rfid, size); reset();
}
// simulated asymmetric c++ array constructor (same pattern as StateList):
PhysicalRegisterFile& operator ()(const char* name, int coreid, int rfid, int size) {
init(name, coreid, rfid, size); reset(); return *this;
}
void init(const char* name, int coreid, int rfid, int size);
// True while at least one register remains on the free list.
bool remaining() const { return (!states[PHYSREG_FREE].empty()); }
PhysicalRegister* alloc(W8 threadid, int r = -1);
void reset(W8 threadid);
ostream& print(ostream& os) const;
OutOfOrderCore& getcore() const { return coreof(coreid); }
private:
void reset();
};
// Stream-insertion convenience wrapper: delegates to PhysicalRegisterFile::print().
static inline ostream& operator <<(ostream& os, const PhysicalRegisterFile& physregs) {
return physregs.print(os);
}
//
// Register Rename Table
//
// Rename table: maps each architectural/transient register to the physical
// register currently holding its value.
struct RegisterRenameTable: public array<PhysicalRegister*, TRANSREG_COUNT> {
#ifdef ENABLE_TRANSIENT_VALUE_TRACKING
bitvec<TRANSREG_COUNT> renamed_in_this_basic_block;
#endif
ostream& print(ostream& os) const;
};
// Stream-insertion convenience wrapper: delegates to RegisterRenameTable::print().
static inline ostream& operator <<(ostream& os, const RegisterRenameTable& rrt) {
return rrt.print(os);
}
enum {
ISSUE_COMPLETED = 1, // issued correctly
ISSUE_NEEDS_REPLAY = 0, // fast scheduling replay
ISSUE_MISSPECULATED = -1, // mis-speculation: redispatch dependent slice
ISSUE_NEEDS_REFETCH = -2, // refetch from RIP of bad insn
};
enum {
COMMIT_RESULT_NONE = 0, // no instructions committed: some uops not ready
COMMIT_RESULT_OK = 1, // committed
COMMIT_RESULT_EXCEPTION = 2, // exception
COMMIT_RESULT_BARRIER = 3,// barrier; branch to microcode (brp uop)
COMMIT_RESULT_SMC = 4, // self modifying code detected
COMMIT_RESULT_INTERRUPT = 5, // interrupt pending
COMMIT_RESULT_STOP = 6 // stop processor model (shutdown)
};
// Branch predictor outcomes:
enum { MISPRED = 0, CORRECT = 1 };
//
// Lookup tables (LUTs):
//
// Static description of one execution cluster: its name, how many uops it
// can issue per cycle, and which functional units (FU_* mask) it contains.
struct Cluster {
const char* name;
W16 issue_width;
W32 fu_mask;
};
extern const Cluster clusters[MAX_CLUSTERS];
extern byte uop_executable_on_cluster[OP_MAX_OPCODE];
extern W32 forward_at_cycle_lut[MAX_CLUSTERS][MAX_FORWARDING_LATENCY+1];
extern const byte archdest_can_commit[TRANSREG_COUNT];
extern const byte archdest_is_visible[TRANSREG_COUNT];
struct OutOfOrderMachine;
// Per-core callbacks invoked by the cache subsystem when an outstanding
// data-cache or instruction-cache request completes, so the core can wake
// up the waiting load or fetch.
struct OutOfOrderCoreCacheCallbacks: public CacheSubsystem::PerCoreCacheCallbacks {
OutOfOrderCore& core;
OutOfOrderCoreCacheCallbacks(OutOfOrderCore& core_): core(core_) { }
virtual void dcache_wakeup(LoadStoreInfo lsi, W64 physaddr);
virtual void icache_wakeup(LoadStoreInfo lsi, W64 physaddr);
};
struct MemoryInterlockEntry {
W64 uuid;
W16 rob;
byte vcpuid;
W8 threadid;
void reset() { uuid = 0; rob = 0; vcpuid = 0; threadid = 0;}
ostream& print(ostream& os, W64 physaddr) const {
os << "phys ", (void*)physaddr, ": vcpu ", vcpuid, ", threadid ", threadid, ", uuid ", uuid, ", rob ", rob;
return os;