1 """DCache
2
3 based on Anton Blanchard microwatt dcache.vhdl
4
5 """
6
7 from enum import Enum, unique
8
9 from nmigen import Module, Signal, Elaboratable, Cat, Repl, Array, Const
10 from nmigen.cli import main
11 from nmutil.iocontrol import RecordObject
12 from nmutil.util import wrap
13 from nmigen.utils import log2_int
14 from soc.experiment.mem_types import (LoadStore1ToDCacheType,
15 DCacheToLoadStore1Type,
16 MMUToDCacheType,
17 DCacheToMMUType)
18
19 from soc.experiment.wb_types import (WB_ADDR_BITS, WB_DATA_BITS, WB_SEL_BITS,
20 WBAddrType, WBDataType, WBSelType,
21 WBMasterOut, WBSlaveOut,
22 WBMasterOutVector, WBSlaveOutVector,
23 WBIOMasterOut, WBIOSlaveOut)
24
25 from soc.experiment.cache_ram import CacheRam
26 from soc.experiment.plru import PLRU
27
28 # for test
29 from nmigen_soc.wishbone.sram import SRAM
30 from nmigen import Memory
31 from nmigen.cli import rtlil
# select the pure-Python simulator back-end; set to False to use cxxsim
if True:
    from nmigen.back.pysim import Simulator, Delay, Settle
else:
    from nmigen.sim.cxxsim import Simulator, Delay, Settle
36
37
38 # TODO: make these parameters of DCache at some point
39 LINE_SIZE = 64 # Line size in bytes
40 NUM_LINES = 16 # Number of lines in a set
41 NUM_WAYS = 4 # Number of ways
TLB_SET_SIZE = 64    # L1 DTLB number of sets
TLB_NUM_WAYS = 2     # L1 DTLB number of ways (entries per set)
44 TLB_LG_PGSZ = 12 # L1 DTLB log_2(page_size)
45 LOG_LENGTH = 0 # Non-zero to enable log data collection
46
# BRAM organisation: We never access more than WB_DATA_BITS
# at a time so, to save resources, we make the array only that
# wide, and use consecutive indices to make a cache "line"
#
# ROW_SIZE is the width in bytes of the BRAM (based on WB, so 64-bits)
ROW_SIZE = WB_DATA_BITS // 8
55
# ROW_PER_LINE is the number of rows (wishbone
# transactions) in a line
58 ROW_PER_LINE = LINE_SIZE // ROW_SIZE
59
60 # BRAM_ROWS is the number of rows in BRAM needed
61 # to represent the full dcache
62 BRAM_ROWS = NUM_LINES * ROW_PER_LINE
63
64
65 # Bit fields counts in the address
66
67 # REAL_ADDR_BITS is the number of real address
68 # bits that we store
69 REAL_ADDR_BITS = 56
70
71 # ROW_BITS is the number of bits to select a row
72 ROW_BITS = log2_int(BRAM_ROWS)
73
74 # ROW_LINE_BITS is the number of bits to select
75 # a row within a line
76 ROW_LINE_BITS = log2_int(ROW_PER_LINE)
77
78 # LINE_OFF_BITS is the number of bits for
79 # the offset in a cache line
80 LINE_OFF_BITS = log2_int(LINE_SIZE)
81
82 # ROW_OFF_BITS is the number of bits for
83 # the offset in a row
84 ROW_OFF_BITS = log2_int(ROW_SIZE)
85
# INDEX_BITS is the number of bits to
87 # select a cache line
88 INDEX_BITS = log2_int(NUM_LINES)
89
90 # SET_SIZE_BITS is the log base 2 of the set size
91 SET_SIZE_BITS = LINE_OFF_BITS + INDEX_BITS
92
93 # TAG_BITS is the number of bits of
94 # the tag part of the address
95 TAG_BITS = REAL_ADDR_BITS - SET_SIZE_BITS
96
97 # TAG_WIDTH is the width in bits of each way of the tag RAM
98 TAG_WIDTH = TAG_BITS + 7 - ((TAG_BITS + 7) % 8)
99
100 # WAY_BITS is the number of bits to select a way
101 WAY_BITS = log2_int(NUM_WAYS)
102
103 # Example of layout for 32 lines of 64 bytes:
104 #
105 # .. tag |index| line |
106 # .. | row | |
107 # .. | |---| | ROW_LINE_BITS (3)
108 # .. | |--- - --| LINE_OFF_BITS (6)
109 # .. | |- --| ROW_OFF_BITS (3)
110 # .. |----- ---| | ROW_BITS (8)
111 # .. |-----| | INDEX_BITS (5)
112 # .. --------| | TAG_BITS (45)
113
114 TAG_RAM_WIDTH = TAG_WIDTH * NUM_WAYS
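# Worked example of the derived geometry, assuming the default
# parameters above (WB_DATA_BITS=64, LINE_SIZE=64, NUM_LINES=16,
# NUM_WAYS=4, REAL_ADDR_BITS=56):
#
#   ROW_SIZE      = 8 bytes        ROW_PER_LINE  = 8
#   BRAM_ROWS     = 128            ROW_BITS      = 7
#   ROW_LINE_BITS = 3              LINE_OFF_BITS = 6
#   ROW_OFF_BITS  = 3              INDEX_BITS    = 4
#   SET_SIZE_BITS = 10             TAG_BITS      = 46
#   TAG_WIDTH     = 48 (46 rounded up to a byte multiple)
#   WAY_BITS      = 2              TAG_RAM_WIDTH = 192 (4 ways x 48)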
115
116 def CacheTagArray():
117 return Array(Signal(TAG_RAM_WIDTH) for x in range(NUM_LINES))
118
def CacheValidBitsArray():
    # one valid bit per way, for each cache line
    return Array(Signal(NUM_WAYS) for x in range(NUM_LINES))
121
122 def RowPerLineValidArray():
123 return Array(Signal() for x in range(ROW_PER_LINE))
124
125 # L1 TLB
126 TLB_SET_BITS = log2_int(TLB_SET_SIZE)
127 TLB_WAY_BITS = log2_int(TLB_NUM_WAYS)
128 TLB_EA_TAG_BITS = 64 - (TLB_LG_PGSZ + TLB_SET_BITS)
129 TLB_TAG_WAY_BITS = TLB_NUM_WAYS * TLB_EA_TAG_BITS
130 TLB_PTE_BITS = 64
TLB_PTE_WAY_BITS = TLB_NUM_WAYS * TLB_PTE_BITS
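# Worked example for the TLB constants, assuming the defaults above
# (TLB_SET_SIZE=64, TLB_NUM_WAYS=2, TLB_LG_PGSZ=12):
#
#   TLB_SET_BITS     = 6    (addr[12:18] selects the TLB set)
#   TLB_WAY_BITS     = 1
#   TLB_EA_TAG_BITS  = 46   (addr[18:64] is the EA tag)
#   TLB_TAG_WAY_BITS = 92   (2 ways x 46 tag bits)
#   TLB_PTE_WAY_BITS = 128  (2 ways x 64-bit PTEs)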
132
133 assert (LINE_SIZE % ROW_SIZE) == 0, "LINE_SIZE not multiple of ROW_SIZE"
assert (LINE_SIZE & (LINE_SIZE - 1)) == 0, "LINE_SIZE not power of 2"
assert (NUM_LINES & (NUM_LINES - 1)) == 0, "NUM_LINES not power of 2"
assert (ROW_PER_LINE & (ROW_PER_LINE - 1)) == 0, "ROW_PER_LINE not power of 2"
137 assert ROW_BITS == (INDEX_BITS + ROW_LINE_BITS), "geometry bits don't add up"
138 assert (LINE_OFF_BITS == ROW_OFF_BITS + ROW_LINE_BITS), \
139 "geometry bits don't add up"
140 assert REAL_ADDR_BITS == (TAG_BITS + INDEX_BITS + LINE_OFF_BITS), \
141 "geometry bits don't add up"
142 assert REAL_ADDR_BITS == (TAG_BITS + ROW_BITS + ROW_OFF_BITS), \
143 "geometry bits don't add up"
144 assert 64 == WB_DATA_BITS, "Can't yet handle wb width that isn't 64-bits"
145 assert SET_SIZE_BITS <= TLB_LG_PGSZ, "Set indexed by virtual address"
146
147
148 def TLBValidBitsArray():
149 return Array(Signal(TLB_NUM_WAYS) for x in range(TLB_SET_SIZE))
150
151 def TLBTagEAArray():
152 return Array(Signal(TLB_EA_TAG_BITS) for x in range (TLB_NUM_WAYS))
153
154 def TLBTagsArray():
155 return Array(Signal(TLB_TAG_WAY_BITS) for x in range (TLB_SET_SIZE))
156
157 def TLBPtesArray():
158 return Array(Signal(TLB_PTE_WAY_BITS) for x in range(TLB_SET_SIZE))
159
160 def HitWaySet():
161 return Array(Signal(WAY_BITS) for x in range(TLB_NUM_WAYS))
162
163 # Cache RAM interface
164 def CacheRamOut():
165 return Array(Signal(WB_DATA_BITS) for x in range(NUM_WAYS))
166
167 # PLRU output interface
168 def PLRUOut():
169 return Array(Signal(WAY_BITS) for x in range(NUM_LINES))
170
171 # TLB PLRU output interface
172 def TLBPLRUOut():
173 return Array(Signal(TLB_WAY_BITS) for x in range(TLB_SET_SIZE))
174
175 # Helper functions to decode incoming requests
176 #
177 # Return the cache line index (tag index) for an address
178 def get_index(addr):
179 return addr[LINE_OFF_BITS:SET_SIZE_BITS]
180
181 # Return the cache row index (data memory) for an address
182 def get_row(addr):
183 return addr[ROW_OFF_BITS:SET_SIZE_BITS]
184
185 # Return the index of a row within a line
186 def get_row_of_line(row):
187 return row[:ROW_LINE_BITS]
188
# Returns whether the address is in the last row of a line
190 def is_last_row_addr(addr, last):
191 return addr[ROW_OFF_BITS:LINE_OFF_BITS] == last
192
193 # Returns whether this is the last row of a line
194 def is_last_row(row, last):
195 return get_row_of_line(row) == last
196
197 # Return the next row in the current cache line. We use a
198 # dedicated function in order to limit the size of the
199 # generated adder to be only the bits within a cache line
200 # (3 bits with default settings)
201 def next_row(row):
202 row_v = row[0:ROW_LINE_BITS] + 1
203 return Cat(row_v[:ROW_LINE_BITS], row[ROW_LINE_BITS:])
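# A short illustration of next_row (values assume the default geometry):
# only the low ROW_LINE_BITS (3) bits are incremented, so the row index
# wraps within its own line, e.g. row 0b0010_111 -> 0b0010_000 rather
# than advancing into the next line.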
204
205 # Get the tag value from the address
206 def get_tag(addr):
207 return addr[SET_SIZE_BITS:REAL_ADDR_BITS]
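# Worked example of the address decode helpers above, assuming the
# default geometry (ROW_OFF_BITS=3, LINE_OFF_BITS=6, SET_SIZE_BITS=10):
# for real address 0x12345,
#   get_index(0x12345) = bits [6:10]  = 0xD  (cache line / tag index)
#   get_row(0x12345)   = bits [3:10]  = 0x68 (BRAM row)
#   get_tag(0x12345)   = bits [10:56] = 0x48 (compared with each way's tag)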
208
209 # Read a tag from a tag memory row
210 def read_tag(way, tagset):
211 return tagset.word_select(way, TAG_WIDTH)[:TAG_BITS]
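# e.g. with TAG_WIDTH=48 and TAG_BITS=46 (default geometry), way 1's tag
# occupies tagset[48:94]: word_select(1, 48) picks bits [48:96] and the
# [:TAG_BITS] slice drops the two padding bits.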
212
213 # Read a TLB tag from a TLB tag memory row
214 def read_tlb_tag(way, tags):
215 return tags.word_select(way, TLB_EA_TAG_BITS)
216
217 # Write a TLB tag to a TLB tag memory row
218 def write_tlb_tag(way, tags, tag):
219 return read_tlb_tag(way, tags).eq(tag)
220
221 # Read a PTE from a TLB PTE memory row
222 def read_tlb_pte(way, ptes):
223 return ptes.word_select(way, TLB_PTE_BITS)
224
225 def write_tlb_pte(way, ptes, newpte):
226 return read_tlb_pte(way, ptes).eq(newpte)
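# e.g. with TLB_PTE_BITS=64 and TLB_NUM_WAYS=2, read_tlb_pte(1, ptes)
# selects ptes[64:128]; write_tlb_tag(0, tags, tag) drives tags[0:46].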
227
228
229 # Record for storing permission, attribute, etc. bits from a PTE
230 class PermAttr(RecordObject):
231 def __init__(self):
232 super().__init__()
233 self.reference = Signal()
234 self.changed = Signal()
235 self.nocache = Signal()
236 self.priv = Signal()
237 self.rd_perm = Signal()
238 self.wr_perm = Signal()
239
240
241 def extract_perm_attr(pte):
242 pa = PermAttr()
243 pa.reference = pte[8]
244 pa.changed = pte[7]
245 pa.nocache = pte[5]
246 pa.priv = pte[3]
247 pa.rd_perm = pte[2]
248 pa.wr_perm = pte[1]
    return pa
250
251
252 # Type of operation on a "valid" input
253 @unique
254 class Op(Enum):
255 OP_NONE = 0
256 OP_BAD = 1 # NC cache hit, TLB miss, prot/RC failure
257 OP_STCX_FAIL = 2 # conditional store w/o reservation
258 OP_LOAD_HIT = 3 # Cache hit on load
259 OP_LOAD_MISS = 4 # Load missing cache
260 OP_LOAD_NC = 5 # Non-cachable load
261 OP_STORE_HIT = 6 # Store hitting cache
262 OP_STORE_MISS = 7 # Store missing cache
263
264
265 # Cache state machine
266 @unique
267 class State(Enum):
268 IDLE = 0 # Normal load hit processing
269 RELOAD_WAIT_ACK = 1 # Cache reload wait ack
270 STORE_WAIT_ACK = 2 # Store wait ack
271 NC_LOAD_WAIT_ACK = 3 # Non-cachable load wait ack
272
273
274 # Dcache operations:
275 #
276 # In order to make timing, we use the BRAMs with
277 # an output buffer, which means that the BRAM
278 # output is delayed by an extra cycle.
279 #
280 # Thus, the dcache has a 2-stage internal pipeline
281 # for cache hits with no stalls.
282 #
283 # All other operations are handled via stalling
284 # in the first stage.
285 #
286 # The second stage can thus complete a hit at the same
287 # time as the first stage emits a stall for a complex op.
288 #
289 # Stage 0 register, basically contains just the latched request
290
291 class RegStage0(RecordObject):
292 def __init__(self, name=None):
293 super().__init__(name=name)
294 self.req = LoadStore1ToDCacheType(name="lsmem")
295 self.tlbie = Signal()
296 self.doall = Signal()
297 self.tlbld = Signal()
298 self.mmu_req = Signal() # indicates source of request
299
300
301 class MemAccessRequest(RecordObject):
302 def __init__(self, name=None):
303 super().__init__(name=name)
304 self.op = Signal(Op)
305 self.valid = Signal()
306 self.dcbz = Signal()
307 self.real_addr = Signal(REAL_ADDR_BITS)
308 self.data = Signal(64)
309 self.byte_sel = Signal(8)
310 self.hit_way = Signal(WAY_BITS)
311 self.same_tag = Signal()
312 self.mmu_req = Signal()
313
314
315 # First stage register, contains state for stage 1 of load hits
316 # and for the state machine used by all other operations
317 class RegStage1(RecordObject):
318 def __init__(self, name=None):
319 super().__init__(name=name)
320 # Info about the request
321 self.full = Signal() # have uncompleted request
322 self.mmu_req = Signal() # request is from MMU
323 self.req = MemAccessRequest(name="reqmem")
324
325 # Cache hit state
326 self.hit_way = Signal(WAY_BITS)
327 self.hit_load_valid = Signal()
328 self.hit_index = Signal(INDEX_BITS)
329 self.cache_hit = Signal()
330
331 # TLB hit state
332 self.tlb_hit = Signal()
333 self.tlb_hit_way = Signal(TLB_NUM_WAYS)
334 self.tlb_hit_index = Signal(TLB_WAY_BITS)
335
336 # 2-stage data buffer for data forwarded from writes to reads
337 self.forward_data1 = Signal(64)
338 self.forward_data2 = Signal(64)
339 self.forward_sel1 = Signal(8)
340 self.forward_valid1 = Signal()
341 self.forward_way1 = Signal(WAY_BITS)
342 self.forward_row1 = Signal(ROW_BITS)
343 self.use_forward1 = Signal()
344 self.forward_sel = Signal(8)
345
346 # Cache miss state (reload state machine)
347 self.state = Signal(State)
348 self.dcbz = Signal()
349 self.write_bram = Signal()
350 self.write_tag = Signal()
351 self.slow_valid = Signal()
352 self.wb = WBMasterOut()
353 self.reload_tag = Signal(TAG_BITS)
354 self.store_way = Signal(WAY_BITS)
355 self.store_row = Signal(ROW_BITS)
356 self.store_index = Signal(INDEX_BITS)
        self.end_row_ix = Signal(ROW_LINE_BITS)  # index of the line's last row
358 self.rows_valid = RowPerLineValidArray()
359 self.acks_pending = Signal(3)
360 self.inc_acks = Signal()
361 self.dec_acks = Signal()
362
363 # Signals to complete (possibly with error)
364 self.ls_valid = Signal()
365 self.ls_error = Signal()
366 self.mmu_done = Signal()
367 self.mmu_error = Signal()
368 self.cache_paradox = Signal()
369
370 # Signal to complete a failed stcx.
371 self.stcx_fail = Signal()
372
373
374 # Reservation information
375 class Reservation(RecordObject):
376 def __init__(self):
377 super().__init__()
378 self.valid = Signal()
379 self.addr = Signal(64-LINE_OFF_BITS)
380
381
382 class DTLBUpdate(Elaboratable):
383 def __init__(self):
384 self.tlbie = Signal()
385 self.tlbwe = Signal()
386 self.doall = Signal()
387 self.updated = Signal()
388 self.v_updated = Signal()
389 self.tlb_hit = Signal()
390 self.tlb_req_index = Signal(TLB_SET_BITS)
391
392 self.tlb_hit_way = Signal(TLB_WAY_BITS)
393 self.tlb_tag_way = Signal(TLB_TAG_WAY_BITS)
394 self.tlb_pte_way = Signal(TLB_PTE_WAY_BITS)
395 self.repl_way = Signal(TLB_WAY_BITS)
396 self.eatag = Signal(TLB_EA_TAG_BITS)
397 self.pte_data = Signal(TLB_PTE_BITS)
398
        self.dv = Signal(TLB_NUM_WAYS)          # current valid bits for the set

        self.tb_out = Signal(TLB_TAG_WAY_BITS)  # updated tag set
        self.pb_out = Signal(TLB_PTE_WAY_BITS)  # updated PTE set
        self.db_out = Signal(TLB_NUM_WAYS)      # updated valid bits
404
405 def elaborate(self, platform):
406 m = Module()
407 comb = m.d.comb
408 sync = m.d.sync
409
410 tagset = Signal(TLB_TAG_WAY_BITS)
411 pteset = Signal(TLB_PTE_WAY_BITS)
412
413 tb_out, pb_out, db_out = self.tb_out, self.pb_out, self.db_out
414
415 with m.If(self.tlbie & self.doall):
416 pass # clear all back in parent
417 with m.Elif(self.tlbie):
            with m.If(self.tlb_hit):
                comb += db_out.eq(self.dv)
                # tlbie: clear the valid bit of the way that hit
                comb += db_out.bit_select(self.tlb_hit_way, 1).eq(0)
                comb += self.v_updated.eq(1)
422
423 with m.Elif(self.tlbwe):
424
425 comb += tagset.eq(self.tlb_tag_way)
426 comb += write_tlb_tag(self.repl_way, tagset, self.eatag)
427 comb += tb_out.eq(tagset)
428
429 comb += pteset.eq(self.tlb_pte_way)
430 comb += write_tlb_pte(self.repl_way, pteset, self.pte_data)
            comb += pb_out.eq(pteset)

            # set the valid bit of the replaced way, preserving the
            # valid bits of the other ways
            comb += db_out.eq(self.dv)
            comb += db_out.bit_select(self.repl_way, 1).eq(1)
434
435 comb += self.updated.eq(1)
436 comb += self.v_updated.eq(1)
437
438 return m
439
450
451 class DCachePendingHit(Elaboratable):
452
453 def __init__(self, tlb_pte_way, tlb_valid_way, tlb_hit_way,
454 cache_valid_idx, cache_tag_set,
455 req_addr,
456 hit_set):
457
458 self.go = Signal()
459 self.virt_mode = Signal()
460 self.is_hit = Signal()
461 self.tlb_hit = Signal()
462 self.hit_way = Signal(WAY_BITS)
463 self.rel_match = Signal()
464 self.req_index = Signal(INDEX_BITS)
465 self.reload_tag = Signal(TAG_BITS)
466
467 self.tlb_hit_way = tlb_hit_way
468 self.tlb_pte_way = tlb_pte_way
469 self.tlb_valid_way = tlb_valid_way
470 self.cache_valid_idx = cache_valid_idx
471 self.cache_tag_set = cache_tag_set
472 self.req_addr = req_addr
473 self.hit_set = hit_set
474
475 def elaborate(self, platform):
476 m = Module()
477 comb = m.d.comb
478 sync = m.d.sync
479
480 go = self.go
481 virt_mode = self.virt_mode
482 is_hit = self.is_hit
483 tlb_pte_way = self.tlb_pte_way
484 tlb_valid_way = self.tlb_valid_way
485 cache_valid_idx = self.cache_valid_idx
486 cache_tag_set = self.cache_tag_set
487 req_addr = self.req_addr
488 tlb_hit_way = self.tlb_hit_way
489 tlb_hit = self.tlb_hit
490 hit_set = self.hit_set
491 hit_way = self.hit_way
492 rel_match = self.rel_match
493 req_index = self.req_index
494 reload_tag = self.reload_tag
495
496 rel_matches = Array(Signal() for i in range(TLB_NUM_WAYS))
497 hit_way_set = HitWaySet()
498
499 # Test if pending request is a hit on any way
500 # In order to make timing in virtual mode,
501 # when we are using the TLB, we compare each
502 # way with each of the real addresses from each way of
503 # the TLB, and then decide later which match to use.
504
505 with m.If(virt_mode):
506 for j in range(TLB_NUM_WAYS):
507 s_tag = Signal(TAG_BITS, name="s_tag%d" % j)
508 s_hit = Signal()
509 s_pte = Signal(TLB_PTE_BITS)
510 s_ra = Signal(REAL_ADDR_BITS)
511 comb += s_pte.eq(read_tlb_pte(j, tlb_pte_way))
512 comb += s_ra.eq(Cat(req_addr[0:TLB_LG_PGSZ],
513 s_pte[TLB_LG_PGSZ:REAL_ADDR_BITS]))
514 comb += s_tag.eq(get_tag(s_ra))
515
516 for i in range(NUM_WAYS):
517 is_tag_hit = Signal()
518 comb += is_tag_hit.eq(go & cache_valid_idx[i] &
519 (read_tag(i, cache_tag_set) == s_tag)
520 & tlb_valid_way[j])
521 with m.If(is_tag_hit):
522 comb += hit_way_set[j].eq(i)
523 comb += s_hit.eq(1)
524 comb += hit_set[j].eq(s_hit)
525 with m.If(s_tag == reload_tag):
526 comb += rel_matches[j].eq(1)
527 with m.If(tlb_hit):
528 comb += is_hit.eq(hit_set[tlb_hit_way])
529 comb += hit_way.eq(hit_way_set[tlb_hit_way])
530 comb += rel_match.eq(rel_matches[tlb_hit_way])
531 with m.Else():
532 s_tag = Signal(TAG_BITS)
533 comb += s_tag.eq(get_tag(req_addr))
534 for i in range(NUM_WAYS):
535 is_tag_hit = Signal()
                # parenthesise the comparison: & binds more tightly than ==
                comb += is_tag_hit.eq(go & cache_valid_idx[i] &
                                      (read_tag(i, cache_tag_set) == s_tag))
538 with m.If(is_tag_hit):
539 comb += hit_way.eq(i)
540 comb += is_hit.eq(1)
541 with m.If(s_tag == reload_tag):
542 comb += rel_match.eq(1)
543
544 return m
545
546
547 class DCache(Elaboratable):
548 """Set associative dcache write-through
549 TODO (in no specific order):
550 * See list in icache.vhdl
551 * Complete load misses on the cycle when WB data comes instead of
552 at the end of line (this requires dealing with requests coming in
553 while not idle...)
554 """
555 def __init__(self):
556 self.d_in = LoadStore1ToDCacheType("d_in")
557 self.d_out = DCacheToLoadStore1Type("d_out")
558
559 self.m_in = MMUToDCacheType("m_in")
560 self.m_out = DCacheToMMUType("m_out")
561
562 self.stall_out = Signal()
563
564 self.wb_out = WBMasterOut()
565 self.wb_in = WBSlaveOut()
566
567 self.log_out = Signal(20)
568
569 def stage_0(self, m, r0, r1, r0_full):
570 """Latch the request in r0.req as long as we're not stalling
571 """
572 comb = m.d.comb
573 sync = m.d.sync
574 d_in, d_out, m_in = self.d_in, self.d_out, self.m_in
575
576 r = RegStage0("stage0")
577
        # TODO, this goes in unit tests and formal proofs
        # (report a collision when loadstore1 and the MMU both present
        #  a valid request in the same cycle)
        with m.If(d_in.valid & m_in.valid):
            #sync += Display("request collision loadstore vs MMU")
            pass
582
583 with m.If(m_in.valid):
584 sync += r.req.valid.eq(1)
585 sync += r.req.load.eq(~(m_in.tlbie | m_in.tlbld))
586 sync += r.req.dcbz.eq(0)
587 sync += r.req.nc.eq(0)
588 sync += r.req.reserve.eq(0)
589 sync += r.req.virt_mode.eq(1)
590 sync += r.req.priv_mode.eq(1)
591 sync += r.req.addr.eq(m_in.addr)
592 sync += r.req.data.eq(m_in.pte)
593 sync += r.req.byte_sel.eq(~0) # Const -1 sets all to 0b111....
594 sync += r.tlbie.eq(m_in.tlbie)
595 sync += r.doall.eq(m_in.doall)
596 sync += r.tlbld.eq(m_in.tlbld)
597 sync += r.mmu_req.eq(1)
598 with m.Else():
599 sync += r.req.eq(d_in)
600 sync += r.tlbie.eq(0)
601 sync += r.doall.eq(0)
602 sync += r.tlbld.eq(0)
603 sync += r.mmu_req.eq(0)
604 with m.If(~(r1.full & r0_full)):
605 sync += r0.eq(r)
606 sync += r0_full.eq(r.req.valid)
607
608 def tlb_read(self, m, r0_stall, tlb_valid_way,
609 tlb_tag_way, tlb_pte_way, dtlb_valid_bits,
610 dtlb_tags, dtlb_ptes):
611 """TLB
612 Operates in the second cycle on the request latched in r0.req.
613 TLB updates write the entry at the end of the second cycle.
614 """
615 comb = m.d.comb
616 sync = m.d.sync
617 m_in, d_in = self.m_in, self.d_in
618
619 index = Signal(TLB_SET_BITS)
620 addrbits = Signal(TLB_SET_BITS)
621
622 amin = TLB_LG_PGSZ
623 amax = TLB_LG_PGSZ + TLB_SET_BITS
624
625 with m.If(m_in.valid):
626 comb += addrbits.eq(m_in.addr[amin : amax])
627 with m.Else():
628 comb += addrbits.eq(d_in.addr[amin : amax])
629 comb += index.eq(addrbits)
630
631 # If we have any op and the previous op isn't finished,
632 # then keep the same output for next cycle.
633 with m.If(~r0_stall):
634 sync += tlb_valid_way.eq(dtlb_valid_bits[index])
635 sync += tlb_tag_way.eq(dtlb_tags[index])
636 sync += tlb_pte_way.eq(dtlb_ptes[index])
637
638 def maybe_tlb_plrus(self, m, r1, tlb_plru_victim):
639 """Generate TLB PLRUs
640 """
641 comb = m.d.comb
642 sync = m.d.sync
643
644 if TLB_NUM_WAYS == 0:
645 return
646 for i in range(TLB_SET_SIZE):
647 # TLB PLRU interface
            tlb_plru = PLRU(TLB_WAY_BITS)
649 setattr(m.submodules, "maybe_plru_%d" % i, tlb_plru)
650 tlb_plru_acc_en = Signal()
651
652 comb += tlb_plru_acc_en.eq(r1.tlb_hit & (r1.tlb_hit_index == i))
653 comb += tlb_plru.acc_en.eq(tlb_plru_acc_en)
654 comb += tlb_plru.acc.eq(r1.tlb_hit_way)
655 comb += tlb_plru_victim[i].eq(tlb_plru.lru_o)
656
657 def tlb_search(self, m, tlb_req_index, r0, r0_valid,
658 tlb_valid_way, tlb_tag_way, tlb_hit_way,
659 tlb_pte_way, pte, tlb_hit, valid_ra, perm_attr, ra):
660
661 comb = m.d.comb
662 sync = m.d.sync
663
664 hitway = Signal(TLB_WAY_BITS)
665 hit = Signal()
666 eatag = Signal(TLB_EA_TAG_BITS)
667
668 TLB_LG_END = TLB_LG_PGSZ + TLB_SET_BITS
669 comb += tlb_req_index.eq(r0.req.addr[TLB_LG_PGSZ : TLB_LG_END])
670 comb += eatag.eq(r0.req.addr[TLB_LG_END : 64 ])
671
672 for i in range(TLB_NUM_WAYS):
673 is_tag_hit = Signal()
            # parenthesise the comparison: & binds more tightly than ==
            comb += is_tag_hit.eq(tlb_valid_way[i]
                                  & (read_tlb_tag(i, tlb_tag_way) == eatag))
676 with m.If(is_tag_hit):
677 comb += hitway.eq(i)
678 comb += hit.eq(1)
679
680 comb += tlb_hit.eq(hit & r0_valid)
681 comb += tlb_hit_way.eq(hitway)
682
683 with m.If(tlb_hit):
684 comb += pte.eq(read_tlb_pte(hitway, tlb_pte_way))
685 with m.Else():
686 comb += pte.eq(0)
687 comb += valid_ra.eq(tlb_hit | ~r0.req.virt_mode)
688 with m.If(r0.req.virt_mode):
689 comb += ra.eq(Cat(Const(0, ROW_OFF_BITS),
690 r0.req.addr[ROW_OFF_BITS:TLB_LG_PGSZ],
691 pte[TLB_LG_PGSZ:REAL_ADDR_BITS]))
692 comb += perm_attr.eq(extract_perm_attr(pte))
693 with m.Else():
694 comb += ra.eq(Cat(Const(0, ROW_OFF_BITS),
695 r0.req.addr[ROW_OFF_BITS:REAL_ADDR_BITS]))
696
697 comb += perm_attr.reference.eq(1)
698 comb += perm_attr.changed.eq(1)
699 comb += perm_attr.priv.eq(1)
700 comb += perm_attr.nocache.eq(0)
701 comb += perm_attr.rd_perm.eq(1)
702 comb += perm_attr.wr_perm.eq(1)
703
704 def tlb_update(self, m, r0_valid, r0, dtlb_valid_bits, tlb_req_index,
705 tlb_hit_way, tlb_hit, tlb_plru_victim, tlb_tag_way,
706 dtlb_tags, tlb_pte_way, dtlb_ptes):
707
708 comb = m.d.comb
709 sync = m.d.sync
710
711 tlbie = Signal()
712 tlbwe = Signal()
713
714 comb += tlbie.eq(r0_valid & r0.tlbie)
715 comb += tlbwe.eq(r0_valid & r0.tlbld)
716
717 m.submodules.tlb_update = d = DTLBUpdate()
718 with m.If(tlbie & r0.doall):
719 # clear all valid bits at once
720 for i in range(TLB_SET_SIZE):
721 sync += dtlb_valid_bits[i].eq(0)
722 with m.If(d.updated):
723 sync += dtlb_tags[tlb_req_index].eq(d.tb_out)
724 sync += dtlb_ptes[tlb_req_index].eq(d.pb_out)
725 with m.If(d.v_updated):
726 sync += dtlb_valid_bits[tlb_req_index].eq(d.db_out)
727
728 comb += d.dv.eq(dtlb_valid_bits[tlb_req_index])
729
730 comb += d.tlbie.eq(tlbie)
731 comb += d.tlbwe.eq(tlbwe)
732 comb += d.doall.eq(r0.doall)
733 comb += d.tlb_hit.eq(tlb_hit)
734 comb += d.tlb_hit_way.eq(tlb_hit_way)
735 comb += d.tlb_tag_way.eq(tlb_tag_way)
736 comb += d.tlb_pte_way.eq(tlb_pte_way)
737 comb += d.tlb_req_index.eq(tlb_req_index)
738
739 with m.If(tlb_hit):
740 comb += d.repl_way.eq(tlb_hit_way)
741 with m.Else():
742 comb += d.repl_way.eq(tlb_plru_victim[tlb_req_index])
743 comb += d.eatag.eq(r0.req.addr[TLB_LG_PGSZ + TLB_SET_BITS:64])
744 comb += d.pte_data.eq(r0.req.data)
745
746 def maybe_plrus(self, m, r1, plru_victim):
747 """Generate PLRUs
748 """
749 comb = m.d.comb
750 sync = m.d.sync
751
        if NUM_WAYS == 0:
753 return
754
755 for i in range(NUM_LINES):
756 # PLRU interface
757 plru = PLRU(WAY_BITS)
758 setattr(m.submodules, "plru%d" % i, plru)
759 plru_acc_en = Signal()
760
761 comb += plru_acc_en.eq(r1.cache_hit & (r1.hit_index == i))
762 comb += plru.acc_en.eq(plru_acc_en)
763 comb += plru.acc.eq(r1.hit_way)
764 comb += plru_victim[i].eq(plru.lru_o)
765
766 def cache_tag_read(self, m, r0_stall, req_index, cache_tag_set, cache_tags):
767 """Cache tag RAM read port
768 """
769 comb = m.d.comb
770 sync = m.d.sync
771 m_in, d_in = self.m_in, self.d_in
772
773 index = Signal(INDEX_BITS)
774
775 with m.If(r0_stall):
776 comb += index.eq(req_index)
777 with m.Elif(m_in.valid):
778 comb += index.eq(get_index(m_in.addr))
779 with m.Else():
780 comb += index.eq(get_index(d_in.addr))
781 sync += cache_tag_set.eq(cache_tags[index])
782
783 def dcache_request(self, m, r0, ra, req_index, req_row, req_tag,
784 r0_valid, r1, cache_valid_bits, replace_way,
785 use_forward1_next, use_forward2_next,
786 req_hit_way, plru_victim, rc_ok, perm_attr,
787 valid_ra, perm_ok, access_ok, req_op, req_go,
788 tlb_pte_way,
789 tlb_hit, tlb_hit_way, tlb_valid_way, cache_tag_set,
790 cancel_store, req_same_tag, r0_stall, early_req_row):
791 """Cache request parsing and hit detection
792 """
793
794 comb = m.d.comb
795 sync = m.d.sync
796 m_in, d_in = self.m_in, self.d_in
797
798 is_hit = Signal()
799 hit_way = Signal(WAY_BITS)
800 op = Signal(Op)
801 opsel = Signal(3)
802 go = Signal()
803 nc = Signal()
804 hit_set = Array(Signal() for i in range(TLB_NUM_WAYS))
        cache_valid_idx = Signal(NUM_WAYS)   # one valid bit per way
806
807 # Extract line, row and tag from request
808 comb += req_index.eq(get_index(r0.req.addr))
809 comb += req_row.eq(get_row(r0.req.addr))
810 comb += req_tag.eq(get_tag(ra))
811
812 comb += go.eq(r0_valid & ~(r0.tlbie | r0.tlbld) & ~r1.ls_error)
813 comb += cache_valid_idx.eq(cache_valid_bits[req_index])
814
815 m.submodules.dcache_pend = dc = DCachePendingHit(tlb_pte_way,
816 tlb_valid_way, tlb_hit_way,
817 cache_valid_idx, cache_tag_set,
818 r0.req.addr,
819 hit_set)
820
821 comb += dc.tlb_hit.eq(tlb_hit)
822 comb += dc.reload_tag.eq(r1.reload_tag)
823 comb += dc.virt_mode.eq(r0.req.virt_mode)
824 comb += dc.go.eq(go)
825 comb += dc.req_index.eq(req_index)
826 comb += is_hit.eq(dc.is_hit)
827 comb += hit_way.eq(dc.hit_way)
828 comb += req_same_tag.eq(dc.rel_match)
829
830 # See if the request matches the line currently being reloaded
831 with m.If((r1.state == State.RELOAD_WAIT_ACK) &
832 (req_index == r1.store_index) & req_same_tag):
833 # For a store, consider this a hit even if the row isn't
834 # valid since it will be by the time we perform the store.
835 # For a load, check the appropriate row valid bit.
836 valid = r1.rows_valid[req_row % ROW_PER_LINE]
837 comb += is_hit.eq(~r0.req.load | valid)
838 comb += hit_way.eq(replace_way)
839
840 # Whether to use forwarded data for a load or not
841 comb += use_forward1_next.eq(0)
842 with m.If((get_row(r1.req.real_addr) == req_row) &
843 (r1.req.hit_way == hit_way)):
844 # Only need to consider r1.write_bram here, since if we
845 # are writing refill data here, then we don't have a
846 # cache hit this cycle on the line being refilled.
847 # (There is the possibility that the load following the
848 # load miss that started the refill could be to the old
849 # contents of the victim line, since it is a couple of
850 # cycles after the refill starts before we see the updated
851 # cache tag. In that case we don't use the bypass.)
852 comb += use_forward1_next.eq(r1.write_bram)
853 comb += use_forward2_next.eq(0)
854 with m.If((r1.forward_row1 == req_row) & (r1.forward_way1 == hit_way)):
855 comb += use_forward2_next.eq(r1.forward_valid1)
856
857 # The way that matched on a hit
858 comb += req_hit_way.eq(hit_way)
859
860 # The way to replace on a miss
861 with m.If(r1.write_tag):
862 comb += replace_way.eq(plru_victim[r1.store_index])
863 with m.Else():
864 comb += replace_way.eq(r1.store_way)
865
866 # work out whether we have permission for this access
867 # NB we don't yet implement AMR, thus no KUAP
868 comb += rc_ok.eq(perm_attr.reference
869 & (r0.req.load | perm_attr.changed)
870 )
        # permission check: privilege must be ok, and either write
        # permission is present or this is a load with read permission
        comb += perm_ok.eq((r0.req.priv_mode | ~perm_attr.priv) &
                           (perm_attr.wr_perm |
                            (r0.req.load & perm_attr.rd_perm)))
875 comb += access_ok.eq(valid_ra & perm_ok & rc_ok)
876 # Combine the request and cache hit status to decide what
877 # operation needs to be done
878 comb += nc.eq(r0.req.nc | perm_attr.nocache)
879 comb += op.eq(Op.OP_NONE)
880 with m.If(go):
881 with m.If(~access_ok):
882 comb += op.eq(Op.OP_BAD)
883 with m.Elif(cancel_store):
884 comb += op.eq(Op.OP_STCX_FAIL)
885 with m.Else():
886 comb += opsel.eq(Cat(is_hit, nc, r0.req.load))
887 with m.Switch(opsel):
888 with m.Case(0b101):
889 comb += op.eq(Op.OP_LOAD_HIT)
890 with m.Case(0b100):
891 comb += op.eq(Op.OP_LOAD_MISS)
892 with m.Case(0b110):
893 comb += op.eq(Op.OP_LOAD_NC)
894 with m.Case(0b001):
895 comb += op.eq(Op.OP_STORE_HIT)
896 with m.Case(0b000):
897 comb += op.eq(Op.OP_STORE_MISS)
898 with m.Case(0b010):
899 comb += op.eq(Op.OP_STORE_MISS)
900 with m.Case(0b011):
901 comb += op.eq(Op.OP_BAD)
902 with m.Case(0b111):
903 comb += op.eq(Op.OP_BAD)
904 with m.Default():
905 comb += op.eq(Op.OP_NONE)
906 comb += req_op.eq(op)
907 comb += req_go.eq(go)
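        # Summary of the opsel encoding used above, Cat(is_hit, nc, load),
        # i.e. bit0=is_hit, bit1=nc, bit2=load:
        #   0b101 load,  cacheable, hit    -> OP_LOAD_HIT
        #   0b100 load,  cacheable, miss   -> OP_LOAD_MISS
        #   0b110 load,  non-cacheable     -> OP_LOAD_NC
        #   0b001 store, cacheable, hit    -> OP_STORE_HIT
        #   0b000 store, cacheable, miss   -> OP_STORE_MISS
        #   0b010 store, non-cacheable     -> OP_STORE_MISS
        #   0b011/0b111 non-cacheable hit  -> OP_BAD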
908
909 # Version of the row number that is valid one cycle earlier
910 # in the cases where we need to read the cache data BRAM.
911 # If we're stalling then we need to keep reading the last
912 # row requested.
913 with m.If(~r0_stall):
914 with m.If(m_in.valid):
915 comb += early_req_row.eq(get_row(m_in.addr))
916 with m.Else():
917 comb += early_req_row.eq(get_row(d_in.addr))
918 with m.Else():
919 comb += early_req_row.eq(req_row)
920
921 def reservation_comb(self, m, cancel_store, set_rsrv, clear_rsrv,
922 r0_valid, r0, reservation):
923 """Handle load-with-reservation and store-conditional instructions
924 """
925 comb = m.d.comb
926 sync = m.d.sync
927
928 with m.If(r0_valid & r0.req.reserve):
929
930 # XXX generate alignment interrupt if address
931 # is not aligned XXX or if r0.req.nc = '1'
932 with m.If(r0.req.load):
933 comb += set_rsrv.eq(1) # load with reservation
934 with m.Else():
935 comb += clear_rsrv.eq(1) # store conditional
                with m.If(~reservation.valid |
                          (r0.req.addr[LINE_OFF_BITS:64] !=
                           reservation.addr)):
                    comb += cancel_store.eq(1)
938
939 def reservation_reg(self, m, r0_valid, access_ok, set_rsrv, clear_rsrv,
940 reservation, r0):
941
942 comb = m.d.comb
943 sync = m.d.sync
944
945 with m.If(r0_valid & access_ok):
946 with m.If(clear_rsrv):
947 sync += reservation.valid.eq(0)
948 with m.Elif(set_rsrv):
949 sync += reservation.valid.eq(1)
950 sync += reservation.addr.eq(r0.req.addr[LINE_OFF_BITS:64])
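        # note: the reservation granule is a cache line: only
        # addr[LINE_OFF_BITS:64] is recorded, so a later stcx. anywhere
        # within the same line (64 bytes by default) still matches.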
951
952 def writeback_control(self, m, r1, cache_out):
953 """Return data for loads & completion control logic
954 """
955 comb = m.d.comb
956 sync = m.d.sync
957 d_out, m_out = self.d_out, self.m_out
958
959 data_out = Signal(64)
960 data_fwd = Signal(64)
961
        # Use the bypass if we are reading the row that was
963 # written 1 or 2 cycles ago, including for the
964 # slow_valid = 1 case (i.e. completing a load
965 # miss or a non-cacheable load).
966 with m.If(r1.use_forward1):
967 comb += data_fwd.eq(r1.forward_data1)
968 with m.Else():
969 comb += data_fwd.eq(r1.forward_data2)
970
971 comb += data_out.eq(cache_out[r1.hit_way])
972
973 for i in range(8):
974 with m.If(r1.forward_sel[i]):
975 dsel = data_fwd.word_select(i, 8)
976 comb += data_out.word_select(i, 8).eq(dsel)
977
978 comb += d_out.valid.eq(r1.ls_valid)
979 comb += d_out.data.eq(data_out)
980 comb += d_out.store_done.eq(~r1.stcx_fail)
981 comb += d_out.error.eq(r1.ls_error)
982 comb += d_out.cache_paradox.eq(r1.cache_paradox)
983
984 # Outputs to MMU
985 comb += m_out.done.eq(r1.mmu_done)
986 comb += m_out.err.eq(r1.mmu_error)
987 comb += m_out.data.eq(data_out)
988
989 # We have a valid load or store hit or we just completed
990 # a slow op such as a load miss, a NC load or a store
991 #
992 # Note: the load hit is delayed by one cycle. However it
993 # can still not collide with r.slow_valid (well unless I
994 # miscalculated) because slow_valid can only be set on a
995 # subsequent request and not on its first cycle (the state
996 # machine must have advanced), which makes slow_valid
997 # at least 2 cycles from the previous hit_load_valid.
998
999 # Sanity: Only one of these must be set in any given cycle
1000
1001 if False: # TODO: need Display to get this to work
1002 assert (r1.slow_valid & r1.stcx_fail) != 1, \
1003 "unexpected slow_valid collision with stcx_fail"
1004
1005 assert ((r1.slow_valid | r1.stcx_fail) | r1.hit_load_valid) != 1, \
1006 "unexpected hit_load_delayed collision with slow_valid"
1007
1008 with m.If(~r1.mmu_req):
1009 # Request came from loadstore1...
1010 # Load hit case is the standard path
1011 with m.If(r1.hit_load_valid):
1012 #Display(f"completing load hit data={data_out}")
1013 pass
1014
1015 # error cases complete without stalling
1016 with m.If(r1.ls_error):
1017 # Display("completing ld/st with error")
1018 pass
1019
1020 # Slow ops (load miss, NC, stores)
1021 with m.If(r1.slow_valid):
1022 #Display(f"completing store or load miss data={data_out}")
1023 pass
1024
1025 with m.Else():
1026 # Request came from MMU
1027 with m.If(r1.hit_load_valid):
1028 # Display(f"completing load hit to MMU, data={m_out.data}")
1029 pass
1030 # error cases complete without stalling
1031 with m.If(r1.mmu_error):
1032 #Display("combpleting MMU ld with error")
1033 pass
1034
1035 # Slow ops (i.e. load miss)
1036 with m.If(r1.slow_valid):
1037 #Display("completing MMU load miss, data={m_out.data}")
1038 pass
1039
1040 def rams(self, m, r1, early_req_row, cache_out, replace_way):
1041 """rams
1042 Generate a cache RAM for each way. This handles the normal
1043 reads, writes from reloads and the special store-hit update
1044 path as well.
1045
1046 Note: the BRAMs have an extra read buffer, meaning the output
1047 is pipelined an extra cycle. This differs from the
1048 icache. The writeback logic needs to take that into
1049 account by using 1-cycle delayed signals for load hits.
1050 """
1051 comb = m.d.comb
1052 wb_in = self.wb_in
1053
1054 for i in range(NUM_WAYS):
1055 do_read = Signal()
1056 rd_addr = Signal(ROW_BITS)
1057 do_write = Signal()
1058 wr_addr = Signal(ROW_BITS)
1059 wr_data = Signal(WB_DATA_BITS)
1060 wr_sel = Signal(ROW_SIZE)
1061 wr_sel_m = Signal(ROW_SIZE)
1062 _d_out = Signal(WB_DATA_BITS)
1063
1064 way = CacheRam(ROW_BITS, WB_DATA_BITS, True)
1065 setattr(m.submodules, "cacheram_%d" % i, way)
1066
1067 comb += way.rd_en.eq(do_read)
1068 comb += way.rd_addr.eq(rd_addr)
1069 comb += _d_out.eq(way.rd_data_o)
1070 comb += way.wr_sel.eq(wr_sel_m)
1071 comb += way.wr_addr.eq(wr_addr)
1072 comb += way.wr_data.eq(wr_data)
1073
1074 # Cache hit reads
1075 comb += do_read.eq(1)
1076 comb += rd_addr.eq(early_req_row)
1077 comb += cache_out[i].eq(_d_out)
1078
1079 # Write mux:
1080 #
1081 # Defaults to wishbone read responses (cache refill)
1082 #
1083 # For timing, the mux on wr_data/sel/addr is not
1084 # dependent on anything other than the current state.
1085
1086 with m.If(r1.write_bram):
1087 # Write store data to BRAM. This happens one
1088 # cycle after the store is in r0.
1089 comb += wr_data.eq(r1.req.data)
1090 comb += wr_sel.eq(r1.req.byte_sel)
1091 comb += wr_addr.eq(get_row(r1.req.real_addr))
1092
1093 with m.If(i == r1.req.hit_way):
1094 comb += do_write.eq(1)
1095 with m.Else():
1096 # Otherwise, we might be doing a reload or a DCBZ
1097 with m.If(r1.dcbz):
1098 comb += wr_data.eq(0)
1099 with m.Else():
1100 comb += wr_data.eq(wb_in.dat)
1101 comb += wr_addr.eq(r1.store_row)
1102 comb += wr_sel.eq(~0) # all 1s
1103
1104 with m.If((r1.state == State.RELOAD_WAIT_ACK)
1105 & wb_in.ack & (replace_way == i)):
1106 comb += do_write.eq(1)
1107
1108 # Mask write selects with do_write since BRAM
1109 # doesn't have a global write-enable
1110 with m.If(do_write):
1111 comb += wr_sel_m.eq(wr_sel)
1112
1113 # Cache hit synchronous machine for the easy case.
1114 # This handles load hits.
1115 # It also handles error cases (TLB miss, cache paradox)
1116 def dcache_fast_hit(self, m, req_op, r0_valid, r0, r1,
1117 req_hit_way, req_index, access_ok,
1118 tlb_hit, tlb_hit_way, tlb_req_index):
1119
1120 comb = m.d.comb
1121 sync = m.d.sync
1122
1123 with m.If(req_op != Op.OP_NONE):
1124 #Display(f"op:{req_op} addr:{r0.req.addr} nc: {r0.req.nc}" \
1125 # f"idx:{req_index} tag:{req_tag} way: {req_hit_way}"
1126 # )
1127 pass
1128
1129 with m.If(r0_valid):
1130 sync += r1.mmu_req.eq(r0.mmu_req)
1131
1132 # Fast path for load/store hits.
1133 # Set signals for the writeback controls.
1134 sync += r1.hit_way.eq(req_hit_way)
1135 sync += r1.hit_index.eq(req_index)
1136
1137 with m.If(req_op == Op.OP_LOAD_HIT):
1138 sync += r1.hit_load_valid.eq(1)
1139 with m.Else():
1140 sync += r1.hit_load_valid.eq(0)
1141
1142 with m.If((req_op == Op.OP_LOAD_HIT) | (req_op == Op.OP_STORE_HIT)):
1143 sync += r1.cache_hit.eq(1)
1144 with m.Else():
1145 sync += r1.cache_hit.eq(0)
1146
1147 with m.If(req_op == Op.OP_BAD):
1148 # Display(f"Signalling ld/st error valid_ra={valid_ra}"
1149 # f"rc_ok={rc_ok} perm_ok={perm_ok}"
1150 sync += r1.ls_error.eq(~r0.mmu_req)
1151 sync += r1.mmu_error.eq(r0.mmu_req)
1152 sync += r1.cache_paradox.eq(access_ok)
1153
1154 with m.Else():
1155 sync += r1.ls_error.eq(0)
1156 sync += r1.mmu_error.eq(0)
1157 sync += r1.cache_paradox.eq(0)
1158
1159 with m.If(req_op == Op.OP_STCX_FAIL):
                sync += r1.stcx_fail.eq(1)
1161 with m.Else():
1162 sync += r1.stcx_fail.eq(0)
1163
1164 # Record TLB hit information for updating TLB PLRU
1165 sync += r1.tlb_hit.eq(tlb_hit)
1166 sync += r1.tlb_hit_way.eq(tlb_hit_way)
1167 sync += r1.tlb_hit_index.eq(tlb_req_index)
1168
1169 # Memory accesses are handled by this state machine:
1170 #
1171 # * Cache load miss/reload (in conjunction with "rams")
1172 # * Load hits for non-cachable forms
1173 # * Stores (the collision case is handled in "rams")
1174 #
1175 # All wishbone requests generation is done here.
1176 # This machine operates at stage 1.
1177 def dcache_slow(self, m, r1, use_forward1_next, use_forward2_next,
1178 cache_valid_bits, r0, replace_way,
1179 req_hit_way, req_same_tag,
1180 r0_valid, req_op, cache_tag, req_go, ra):
1181
1182 comb = m.d.comb
1183 sync = m.d.sync
1184 wb_in = self.wb_in
1185
1186 req = MemAccessRequest()
1187 acks = Signal(3)
1188 adjust_acks = Signal(3)
1189 stbs_done = Signal()
1190
1191 sync += r1.use_forward1.eq(use_forward1_next)
1192 sync += r1.forward_sel.eq(0)
1193
1194 with m.If(use_forward1_next):
1195 sync += r1.forward_sel.eq(r1.req.byte_sel)
1196 with m.Elif(use_forward2_next):
1197 sync += r1.forward_sel.eq(r1.forward_sel1)
1198
1199 sync += r1.forward_data2.eq(r1.forward_data1)
1200 with m.If(r1.write_bram):
1201 sync += r1.forward_data1.eq(r1.req.data)
1202 sync += r1.forward_sel1.eq(r1.req.byte_sel)
1203 sync += r1.forward_way1.eq(r1.req.hit_way)
1204 sync += r1.forward_row1.eq(get_row(r1.req.real_addr))
1205 sync += r1.forward_valid1.eq(1)
1206 with m.Else():
1207 with m.If(r1.dcbz):
1208 sync += r1.forward_data1.eq(0)
1209 with m.Else():
1210 sync += r1.forward_data1.eq(wb_in.dat)
1211 sync += r1.forward_sel1.eq(~0) # all 1s
1212 sync += r1.forward_way1.eq(replace_way)
1213 sync += r1.forward_row1.eq(r1.store_row)
1214 sync += r1.forward_valid1.eq(0)
1215
1216 # One cycle pulses reset
1217 sync += r1.slow_valid.eq(0)
1218 sync += r1.write_bram.eq(0)
1219 sync += r1.inc_acks.eq(0)
1220 sync += r1.dec_acks.eq(0)
1221
1222 sync += r1.ls_valid.eq(0)
1223 # complete tlbies and TLB loads in the third cycle
1224 sync += r1.mmu_done.eq(r0_valid & (r0.tlbie | r0.tlbld))
1225
1226 with m.If((req_op == Op.OP_LOAD_HIT)
1227 | (req_op == Op.OP_STCX_FAIL)):
1228 with m.If(~r0.mmu_req):
1229 sync += r1.ls_valid.eq(1)
1230 with m.Else():
1231 sync += r1.mmu_done.eq(1)
1232
1233 with m.If(r1.write_tag):
1234 # Store new tag in selected way
1235 for i in range(NUM_WAYS):
1236 with m.If(i == replace_way):
1237 ct = Signal(TAG_RAM_WIDTH)
1238 comb += ct.eq(cache_tag[r1.store_index])
1239 comb += ct.word_select(i, TAG_WIDTH).eq(r1.reload_tag)
1240 sync += cache_tag[r1.store_index].eq(ct)
1241 sync += r1.store_way.eq(replace_way)
1242 sync += r1.write_tag.eq(0)
1243
1244 # Take request from r1.req if there is one there,
1245 # else from req_op, ra, etc.
1246 with m.If(r1.full):
1247 comb += req.eq(r1.req)
1248 with m.Else():
1249 comb += req.op.eq(req_op)
1250 comb += req.valid.eq(req_go)
1251 comb += req.mmu_req.eq(r0.mmu_req)
1252 comb += req.dcbz.eq(r0.req.dcbz)
1253 comb += req.real_addr.eq(ra)
1254
1255 with m.If(~r0.req.dcbz):
1256 comb += req.data.eq(r0.req.data)
1257 with m.Else():
1258 comb += req.data.eq(0)
1259
1260 # Select all bytes for dcbz
1261 # and for cacheable loads
1262 with m.If(r0.req.dcbz | (r0.req.load & ~r0.req.nc)):
1263 comb += req.byte_sel.eq(~0) # all 1s
1264 with m.Else():
1265 comb += req.byte_sel.eq(r0.req.byte_sel)
1266 comb += req.hit_way.eq(req_hit_way)
1267 comb += req.same_tag.eq(req_same_tag)
1268
1269 # Store the incoming request from r0,
1270 # if it is a slow request
1271 # Note that r1.full = 1 implies req_op = OP_NONE
1272 with m.If((req_op == Op.OP_LOAD_MISS)
1273 | (req_op == Op.OP_LOAD_NC)
1274 | (req_op == Op.OP_STORE_MISS)
1275 | (req_op == Op.OP_STORE_HIT)):
1276 sync += r1.req.eq(req)
1277 sync += r1.full.eq(1)
1278
1279 # Main state machine
1280 with m.Switch(r1.state):
1281
1282 with m.Case(State.IDLE):
1283 # XXX check 'left downto. probably means len(r1.wb.adr)
1284 # r1.wb.adr <= req.real_addr(
1285 # r1.wb.adr'left downto 0
1286 # );
1287 sync += r1.wb.adr.eq(req.real_addr)
1288 sync += r1.wb.sel.eq(req.byte_sel)
1289 sync += r1.wb.dat.eq(req.data)
1290 sync += r1.dcbz.eq(req.dcbz)
1291
1292 # Keep track of our index and way
1293 # for subsequent stores.
1294 sync += r1.store_index.eq(get_index(req.real_addr))
1295 sync += r1.store_row.eq(get_row(req.real_addr))
1296 sync += r1.end_row_ix.eq(
1297 get_row_of_line(get_row(req.real_addr))
1298 )
1299 sync += r1.reload_tag.eq(get_tag(req.real_addr))
1300 sync += r1.req.same_tag.eq(1)
1301
1302 with m.If(req.op == Op.OP_STORE_HIT):
1303 sync += r1.store_way.eq(req.hit_way)
1304
1305 # Reset per-row valid bits,
1306 # ready for handling OP_LOAD_MISS
1307 for i in range(ROW_PER_LINE):
1308 sync += r1.rows_valid[i].eq(0)
1309
1310 with m.Switch(req.op):
1311 with m.Case(Op.OP_LOAD_HIT):
1312 # stay in IDLE state
1313 pass
1314
1315 with m.Case(Op.OP_LOAD_MISS):
1316 #Display(f"cache miss real addr:" \
1317 # f"{req_real_addr}" \
1318 # f" idx:{get_index(req_real_addr)}" \
1319 # f" tag:{get_tag(req.real_addr)}")
1320 pass
1321
1322 # Start the wishbone cycle
1323 sync += r1.wb.we.eq(0)
1324 sync += r1.wb.cyc.eq(1)
1325 sync += r1.wb.stb.eq(1)
1326
1327 # Track that we had one request sent
1328 sync += r1.state.eq(State.RELOAD_WAIT_ACK)
1329 sync += r1.write_tag.eq(1)
1330
1331 with m.Case(Op.OP_LOAD_NC):
1332 sync += r1.wb.cyc.eq(1)
1333 sync += r1.wb.stb.eq(1)
1334 sync += r1.wb.we.eq(0)
1335 sync += r1.state.eq(State.NC_LOAD_WAIT_ACK)
1336
1337 with m.Case(Op.OP_STORE_HIT, Op.OP_STORE_MISS):
1338 with m.If(~req.dcbz):
1339 sync += r1.state.eq(State.STORE_WAIT_ACK)
1340 sync += r1.acks_pending.eq(1)
1341 sync += r1.full.eq(0)
1342 sync += r1.slow_valid.eq(1)
1343
1344 with m.If(~req.mmu_req):
1345 sync += r1.ls_valid.eq(1)
1346 with m.Else():
1347 sync += r1.mmu_done.eq(1)
1348
1349 with m.If(req.op == Op.OP_STORE_HIT):
1350 sync += r1.write_bram.eq(1)
1351 with m.Else():
1352 sync += r1.state.eq(State.RELOAD_WAIT_ACK)
1353
1354 with m.If(req.op == Op.OP_STORE_MISS):
1355 sync += r1.write_tag.eq(1)
1356
1357 sync += r1.wb.we.eq(1)
1358 sync += r1.wb.cyc.eq(1)
1359 sync += r1.wb.stb.eq(1)
1360
1361 # OP_NONE and OP_BAD do nothing
1362 # OP_BAD & OP_STCX_FAIL were
1363 # handled above already
1364 with m.Case(Op.OP_NONE):
1365 pass
1366 with m.Case(Op.OP_BAD):
1367 pass
1368 with m.Case(Op.OP_STCX_FAIL):
1369 pass
1370
1371 with m.Case(State.RELOAD_WAIT_ACK):
1372 # Requests are all sent if stb is 0
1373 comb += stbs_done.eq(~r1.wb.stb)
1374
1375 with m.If(~wb_in.stall & ~stbs_done):
1376 # That was the last word?
1377 # We are done sending.
1378 # Clear stb and set stbs_done
1379 # so we can handle an eventual
1380 # last ack on the same cycle.
1381 with m.If(is_last_row_addr(
1382 r1.wb.adr, r1.end_row_ix)):
1383 sync += r1.wb.stb.eq(0)
1384 comb += stbs_done.eq(0)
1385
1386 # Calculate the next row address in the current cache line
1387 rarange = r1.wb.adr[ROW_OFF_BITS : LINE_OFF_BITS]
1388 sync += rarange.eq(rarange + 1)
1389
1390 # Incoming acks processing
1391 sync += r1.forward_valid1.eq(wb_in.ack)
1392 with m.If(wb_in.ack):
1393 # XXX needs an Array bit-accessor here
1394 sync += r1.rows_valid[r1.store_row % ROW_PER_LINE].eq(1)
1395
1396 # If this is the data we were looking for,
1397 # we can complete the request next cycle.
1398 # Compare the whole address in case the
1399 # request in r1.req is not the one that
1400 # started this refill.
1401 with m.If(r1.full & r1.req.same_tag &
1402 ((r1.dcbz & r1.req.dcbz) |
1403 (~r1.dcbz & (r1.req.op == Op.OP_LOAD_MISS))) &
1404 (r1.store_row == get_row(r1.req.real_addr))):
1405 sync += r1.full.eq(0)
1406 sync += r1.slow_valid.eq(1)
1407 with m.If(~r1.mmu_req):
1408 sync += r1.ls_valid.eq(1)
1409 with m.Else():
1410 sync += r1.mmu_done.eq(1)
1411 sync += r1.forward_sel.eq(~0) # all 1s
1412 sync += r1.use_forward1.eq(1)
1413
1414 # Check for completion
1415 with m.If(stbs_done & is_last_row(r1.store_row,
1416 r1.end_row_ix)):
1417 # Complete wishbone cycle
1418 sync += r1.wb.cyc.eq(0)
1419
                        # Cache line is now valid: update the valid-bit
                        # vector for this index and write it back
                        cv = Signal(NUM_WAYS)
                        comb += cv.eq(cache_valid_bits[r1.store_index])
                        comb += cv.bit_select(r1.store_way, 1).eq(1)
                        sync += cache_valid_bits[r1.store_index].eq(cv)
                        sync += r1.state.eq(State.IDLE)
1425
1426 # Increment store row counter
1427 sync += r1.store_row.eq(next_row(r1.store_row))
1428
1429 with m.Case(State.STORE_WAIT_ACK):
1430 comb += stbs_done.eq(~r1.wb.stb)
1431 comb += acks.eq(r1.acks_pending)
1432
1433 with m.If(r1.inc_acks != r1.dec_acks):
1434 with m.If(r1.inc_acks):
1435 comb += adjust_acks.eq(acks + 1)
1436 with m.Else():
1437 comb += adjust_acks.eq(acks - 1)
1438 with m.Else():
1439 comb += adjust_acks.eq(acks)
1440
1441 sync += r1.acks_pending.eq(adjust_acks)
1442
1443 # Clear stb when slave accepted request
1444 with m.If(~wb_in.stall):
1445 # See if there is another store waiting
1446 # to be done which is in the same real page.
1447 with m.If(req.valid):
1448 ra = req.real_addr[0:SET_SIZE_BITS]
1449 sync += r1.wb.adr[0:SET_SIZE_BITS].eq(ra)
1450 sync += r1.wb.dat.eq(req.data)
1451 sync += r1.wb.sel.eq(req.byte_sel)
1452
1453 with m.Elif((adjust_acks < 7) & req.same_tag &
1454 ((req.op == Op.OP_STORE_MISS)
1455 | (req.op == Op.OP_STORE_HIT))):
1456 sync += r1.wb.stb.eq(1)
1457 comb += stbs_done.eq(0)
1458
1459 with m.If(req.op == Op.OP_STORE_HIT):
1460 sync += r1.write_bram.eq(1)
1461 sync += r1.full.eq(0)
1462 sync += r1.slow_valid.eq(1)
1463
1464 # Store requests never come from the MMU
1465 sync += r1.ls_valid.eq(1)
1466 comb += stbs_done.eq(0)
1467 sync += r1.inc_acks.eq(1)
1468 with m.Else():
1469 sync += r1.wb.stb.eq(0)
1470 comb += stbs_done.eq(1)
1471
1472 # Got ack ? See if complete.
1473 with m.If(wb_in.ack):
1474 with m.If(stbs_done & (adjust_acks == 1)):
1475 sync += r1.state.eq(State.IDLE)
1476 sync += r1.wb.cyc.eq(0)
1477 sync += r1.wb.stb.eq(0)
1478 sync += r1.dec_acks.eq(1)
1479
1480 with m.Case(State.NC_LOAD_WAIT_ACK):
1481 # Clear stb when slave accepted request
1482 with m.If(~wb_in.stall):
1483 sync += r1.wb.stb.eq(0)
1484
1485 # Got ack ? complete.
1486 with m.If(wb_in.ack):
1487 sync += r1.state.eq(State.IDLE)
1488 sync += r1.full.eq(0)
1489 sync += r1.slow_valid.eq(1)
1490
1491 with m.If(~r1.mmu_req):
1492 sync += r1.ls_valid.eq(1)
1493 with m.Else():
1494 sync += r1.mmu_done.eq(1)
1495
1496 sync += r1.forward_sel.eq(~0) # all 1s
1497 sync += r1.use_forward1.eq(1)
1498 sync += r1.wb.cyc.eq(0)
1499 sync += r1.wb.stb.eq(0)
1500
1501 def dcache_log(self, m, r1, valid_ra, tlb_hit_way, stall_out):
1502
1503 sync = m.d.sync
1504 d_out, wb_in, log_out = self.d_out, self.wb_in, self.log_out
1505
1506 sync += log_out.eq(Cat(r1.state[:3], valid_ra, tlb_hit_way[:3],
1507 stall_out, req_op[:3], d_out.valid, d_out.error,
1508 r1.wb.cyc, r1.wb.stb, wb_in.ack, wb_in.stall,
1509 r1.wb.adr[3:6]))
1510
1511 def elaborate(self, platform):
1512
1513 m = Module()
1514 comb = m.d.comb
1515
1516 # Storage. Hopefully "cache_rows" is a BRAM, the rest is LUTs
1517 cache_tags = CacheTagArray()
1518 cache_tag_set = Signal(TAG_RAM_WIDTH)
1519 cache_valid_bits = CacheValidBitsArray()
1520
1521 # TODO attribute ram_style : string;
1522 # TODO attribute ram_style of cache_tags : signal is "distributed";
1523
1524 """note: these are passed to nmigen.hdl.Memory as "attributes".
1525 don't know how, just that they are.
1526 """
1527 dtlb_valid_bits = TLBValidBitsArray()
1528 dtlb_tags = TLBTagsArray()
1529 dtlb_ptes = TLBPtesArray()
1530 # TODO attribute ram_style of
1531 # dtlb_tags : signal is "distributed";
1532 # TODO attribute ram_style of
1533 # dtlb_ptes : signal is "distributed";
1534
1535 r0 = RegStage0("r0")
1536 r0_full = Signal()
1537
1538 r1 = RegStage1("r1")
1539
1540 reservation = Reservation()
1541
1542 # Async signals on incoming request
1543 req_index = Signal(INDEX_BITS)
1544 req_row = Signal(ROW_BITS)
1545 req_hit_way = Signal(WAY_BITS)
1546 req_tag = Signal(TAG_BITS)
1547 req_op = Signal(Op)
1548 req_data = Signal(64)
1549 req_same_tag = Signal()
1550 req_go = Signal()
1551
1552 early_req_row = Signal(ROW_BITS)
1553
1554 cancel_store = Signal()
1555 set_rsrv = Signal()
1556 clear_rsrv = Signal()
1557
1558 r0_valid = Signal()
1559 r0_stall = Signal()
1560
1561 use_forward1_next = Signal()
1562 use_forward2_next = Signal()
1563
1564 cache_out = CacheRamOut()
1565
1566 plru_victim = PLRUOut()
1567 replace_way = Signal(WAY_BITS)
1568
1569 # Wishbone read/write/cache write formatting signals
1570 bus_sel = Signal(8)
1571
1572 # TLB signals
1573 tlb_tag_way = Signal(TLB_TAG_WAY_BITS)
1574 tlb_pte_way = Signal(TLB_PTE_WAY_BITS)
1575 tlb_valid_way = Signal(TLB_NUM_WAYS)
1576 tlb_req_index = Signal(TLB_SET_BITS)
1577 tlb_hit = Signal()
1578 tlb_hit_way = Signal(TLB_WAY_BITS)
1579 pte = Signal(TLB_PTE_BITS)
1580 ra = Signal(REAL_ADDR_BITS)
1581 valid_ra = Signal()
1582 perm_attr = PermAttr()
1583 rc_ok = Signal()
1584 perm_ok = Signal()
1585 access_ok = Signal()
1586
1587 tlb_plru_victim = TLBPLRUOut()
1588
1589 # we don't yet handle collisions between loadstore1 requests
1590 # and MMU requests
1591 comb += self.m_out.stall.eq(0)
1592
1593 # Hold off the request in r0 when r1 has an uncompleted request
1594 comb += r0_stall.eq(r0_full & r1.full)
1595 comb += r0_valid.eq(r0_full & ~r1.full)
1596 comb += self.stall_out.eq(r0_stall)
1597
1598 # Wire up wishbone request latch out of stage 1
1599 comb += self.wb_out.eq(r1.wb)
1600
1601 # call sub-functions putting everything together, using shared
1602 # signals established above
1603 self.stage_0(m, r0, r1, r0_full)
1604 self.tlb_read(m, r0_stall, tlb_valid_way,
1605 tlb_tag_way, tlb_pte_way, dtlb_valid_bits,
1606 dtlb_tags, dtlb_ptes)
1607 self.tlb_search(m, tlb_req_index, r0, r0_valid,
1608 tlb_valid_way, tlb_tag_way, tlb_hit_way,
1609 tlb_pte_way, pte, tlb_hit, valid_ra, perm_attr, ra)
1610 self.tlb_update(m, r0_valid, r0, dtlb_valid_bits, tlb_req_index,
1611 tlb_hit_way, tlb_hit, tlb_plru_victim, tlb_tag_way,
1612 dtlb_tags, tlb_pte_way, dtlb_ptes)
1613 self.maybe_plrus(m, r1, plru_victim)
1614 self.maybe_tlb_plrus(m, r1, tlb_plru_victim)
1615 self.cache_tag_read(m, r0_stall, req_index, cache_tag_set, cache_tags)
1616 self.dcache_request(m, r0, ra, req_index, req_row, req_tag,
1617 r0_valid, r1, cache_valid_bits, replace_way,
1618 use_forward1_next, use_forward2_next,
1619 req_hit_way, plru_victim, rc_ok, perm_attr,
1620 valid_ra, perm_ok, access_ok, req_op, req_go,
1621 tlb_pte_way,
1622 tlb_hit, tlb_hit_way, tlb_valid_way, cache_tag_set,
1623 cancel_store, req_same_tag, r0_stall, early_req_row)
1624 self.reservation_comb(m, cancel_store, set_rsrv, clear_rsrv,
1625 r0_valid, r0, reservation)
1626 self.reservation_reg(m, r0_valid, access_ok, set_rsrv, clear_rsrv,
1627 reservation, r0)
1628 self.writeback_control(m, r1, cache_out)
1629 self.rams(m, r1, early_req_row, cache_out, replace_way)
1630 self.dcache_fast_hit(m, req_op, r0_valid, r0, r1,
1631 req_hit_way, req_index, access_ok,
1632 tlb_hit, tlb_hit_way, tlb_req_index)
1633 self.dcache_slow(m, r1, use_forward1_next, use_forward2_next,
1634 cache_valid_bits, r0, replace_way,
1635 req_hit_way, req_same_tag,
1636 r0_valid, req_op, cache_tags, req_go, ra)
1637 #self.dcache_log(m, r1, valid_ra, tlb_hit_way, stall_out)
1638
1639 return m
1640
1641
1642 def dcache_sim(dut):
1643 # clear stuff
1644 yield dut.d_in.valid.eq(0)
1645 yield dut.d_in.load.eq(0)
1646 yield dut.d_in.nc.eq(0)
1647 yield dut.d_in.addr.eq(0)
1648 yield dut.d_in.data.eq(0)
1649 yield dut.m_in.valid.eq(0)
1650 yield dut.m_in.addr.eq(0)
1651 yield dut.m_in.pte.eq(0)
1652 # wait 4 * clk_period
1653 yield
1654 yield
1655 yield
1656 yield
1657
1658 # Cacheable read of address 4
1659 yield dut.d_in.load.eq(1)
1660 yield dut.d_in.nc.eq(0)
1661 yield dut.d_in.addr.eq(0x0000000000000004)
1662 yield dut.d_in.valid.eq(1)
1663 yield
1664 yield dut.d_in.valid.eq(0)
1665 yield
1666 while not (yield dut.d_out.valid):
1667 yield
1668 data = yield dut.d_out.data
1669 addr = yield dut.d_in.addr
1670 assert data == 0x0000000100000000, \
1671 f"data @%x=%x expected 0x0000000100000000" % (data, addr)
1672
1673 # Cacheable read of address 30
1674 yield dut.d_in.load.eq(1)
1675 yield dut.d_in.nc.eq(0)
1676 yield dut.d_in.addr.eq(Const(0x0000000000000030, 64))
1677 yield dut.d_in.valid.eq(1)
1678 yield
1679 yield dut.d_in.valid.eq(0)
1680 yield
1681 while not (yield dut.d_out.valid):
1682 yield
1683 data = yield dut.d_out.data
1684 addr = yield dut.d_in.addr
1685 assert data == 0x0000000D0000000C, \
1686 f"data @%x=%x expected 0000000D0000000C" % (data, addr)
1687
1688 # Non-cacheable read of address 100
1689 yield dut.d_in.load.eq(1)
1690 yield dut.d_in.nc.eq(1)
1691 yield dut.d_in.addr.eq(Const(0x0000000000000100, 64))
1692 yield dut.d_in.valid.eq(1)
1693 yield
1694 yield dut.d_in.valid.eq(0)
1695 yield
1696 while not (yield dut.d_out.valid):
1697 yield
1698 data = yield dut.d_out.data
1699 addr = yield dut.d_in.addr
1700 assert data == 0x0000004100000040, \
1701 f"data @%x=%x expected 0000004100000040" % (data, addr)
1702
1703 yield
1704 yield
1705 yield
1706 yield
1707
1708
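# A hedged sketch of an additional test process (not wired up by default):
# it performs a cacheable store followed by a load from the same address
# and checks the read-back value.  It assumes the same d_in/d_out fields
# and handshake timing used by dcache_sim() above; the address and data
# values are illustrative only.  To try it, pass wrap(dcache_store_sim(dut))
# to sim.add_sync_process() in place of the existing process.
def dcache_store_sim(dut):
    # store a doubleword to address 0x40
    yield dut.d_in.load.eq(0)
    yield dut.d_in.nc.eq(0)
    yield dut.d_in.addr.eq(0x0000000000000040)
    yield dut.d_in.data.eq(0x0123456789abcdef)
    yield dut.d_in.byte_sel.eq(0xff)
    yield dut.d_in.valid.eq(1)
    yield
    yield dut.d_in.valid.eq(0)
    yield
    while not (yield dut.d_out.valid):
        yield

    # read the same address back and compare
    yield dut.d_in.load.eq(1)
    yield dut.d_in.addr.eq(0x0000000000000040)
    yield dut.d_in.valid.eq(1)
    yield
    yield dut.d_in.valid.eq(0)
    yield
    while not (yield dut.d_out.valid):
        yield
    data = yield dut.d_out.data
    assert data == 0x0123456789abcdef, \
        "store readback mismatch: %x" % data

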
1709 def test_dcache():
1710 dut = DCache()
1711 vl = rtlil.convert(dut, ports=[])
1712 with open("test_dcache.il", "w") as f:
1713 f.write(vl)
1714
1715 memory = Memory(width=64, depth=16*8, init=range(128))
1716 sram = SRAM(memory=memory, granularity=8)
1717
1718 m = Module()
1719 m.submodules.dcache = dut
1720 m.submodules.sram = sram
1721
1722 m.d.comb += sram.bus.cyc.eq(dut.wb_out.cyc)
1723 m.d.comb += sram.bus.stb.eq(dut.wb_out.stb)
1724 m.d.comb += sram.bus.we.eq(dut.wb_out.we)
1725 m.d.comb += sram.bus.sel.eq(dut.wb_out.sel)
1726 m.d.comb += sram.bus.adr.eq(dut.wb_out.adr)
1727 m.d.comb += sram.bus.dat_w.eq(dut.wb_out.dat)
1728
1729 m.d.comb += dut.wb_in.ack.eq(sram.bus.ack)
1730 m.d.comb += dut.wb_in.dat.eq(sram.bus.dat_r)
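    # note: dut.wb_in.stall is not connected and therefore stays at 0,
    # i.e. the SRAM slave is assumed never to stall a request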
1731
1732 # nmigen Simulation
1733 sim = Simulator(m)
1734 sim.add_clock(1e-6)
1735
1736 sim.add_sync_process(wrap(dcache_sim(dut)))
1737 with sim.write_vcd('test_dcache.vcd'):
1738 sim.run()
1739
1740 if __name__ == '__main__':
1741 test_dcache()
1742