1 """DCache
2
3 based on Anton Blanchard microwatt dcache.vhdl
4
5 """
6
7 from enum import Enum, unique
8
9 from nmigen import Module, Signal, Elaboratable, Cat, Repl, Array, Const
10 from nmigen.cli import main
11 from nmutil.iocontrol import RecordObject
12 from nmutil.util import wrap
13 from nmigen.utils import log2_int
14 from nmigen.cli import rtlil
15
16 if True:
17 from nmigen.back.pysim import Simulator, Delay, Settle
18 else:
19 from nmigen.sim.cxxsim import Simulator, Delay, Settle
20
21 from soc.experiment.mem_types import (LoadStore1ToDCacheType,
22 DCacheToLoadStore1Type,
23 MMUToDCacheType,
24 DCacheToMMUType)
25
26 from soc.experiment.wb_types import (WB_ADDR_BITS, WB_DATA_BITS, WB_SEL_BITS,
27 WBAddrType, WBDataType, WBSelType,
28 WBMasterOut, WBSlaveOut,
29 WBMasterOutVector, WBSlaveOutVector,
30 WBIOMasterOut, WBIOSlaveOut)
31
32 from soc.experiment.cache_ram import CacheRam
33 from soc.experiment.plru import PLRU
34
35
36 # TODO: make these parameters of DCache at some point
37 LINE_SIZE = 64 # Line size in bytes
38 NUM_LINES = 32 # Number of lines in a set
39 NUM_WAYS = 4 # Number of ways
40 TLB_SET_SIZE = 64 # L1 DTLB entries per set
41 TLB_NUM_WAYS = 2 # L1 DTLB number of sets
42 TLB_LG_PGSZ = 12 # L1 DTLB log_2(page_size)
43 LOG_LENGTH = 0 # Non-zero to enable log data collection
44
45 # BRAM organisation: We never access more than
46 # -- WB_DATA_BITS at a time so to save
47 # -- resources we make the array only that wide, and
48 # -- use consecutive indices for to make a cache "line"
49 # --
50 # -- ROW_SIZE is the width in bytes of the BRAM
51 # -- (based on WB, so 64-bits)
52 ROW_SIZE = WB_DATA_BITS // 8;
53
54 # ROW_PER_LINE is the number of row (wishbone
55 # transactions) in a line
56 ROW_PER_LINE = LINE_SIZE // ROW_SIZE
57
58 # BRAM_ROWS is the number of rows in BRAM needed
59 # to represent the full dcache
60 BRAM_ROWS = NUM_LINES * ROW_PER_LINE
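
# With the defaults above (and WB_DATA_BITS = 64, asserted below), the
# geometry works out as:
#
#     ROW_SIZE     = 64 // 8   # = 8 bytes per BRAM row
#     ROW_PER_LINE = 64 // 8   # = 8 rows per 64-byte line
#     BRAM_ROWS    = 32 * 8    # = 256 rows in total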


# Bit fields counts in the address

# REAL_ADDR_BITS is the number of real address
# bits that we store
REAL_ADDR_BITS = 56

# ROW_BITS is the number of bits to select a row
ROW_BITS = log2_int(BRAM_ROWS)

# ROW_LINE_BITS is the number of bits to select
# a row within a line
ROW_LINE_BITS = log2_int(ROW_PER_LINE)

# LINE_OFF_BITS is the number of bits for
# the offset in a cache line
LINE_OFF_BITS = log2_int(LINE_SIZE)

# ROW_OFF_BITS is the number of bits for
# the offset in a row
ROW_OFF_BITS = log2_int(ROW_SIZE)

# INDEX_BITS is the number of bits to
# select a cache line
INDEX_BITS = log2_int(NUM_LINES)

# SET_SIZE_BITS is the log base 2 of the set size
SET_SIZE_BITS = LINE_OFF_BITS + INDEX_BITS

# TAG_BITS is the number of bits of
# the tag part of the address
TAG_BITS = REAL_ADDR_BITS - SET_SIZE_BITS

# TAG_WIDTH is the width in bits of each way of the tag RAM
TAG_WIDTH = TAG_BITS + 7 - ((TAG_BITS + 7) % 8)
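
# For the default geometry TAG_BITS = 56 - 11 = 45, so TAG_WIDTH rounds
# up to the next multiple of 8: 45 + 7 - ((45 + 7) % 8) = 52 - 4 = 48.
# The 3 pad bits keep each way byte-aligned within the tag RAM row.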

# WAY_BITS is the number of bits to select a way
WAY_BITS = log2_int(NUM_WAYS)

# Example of layout for 32 lines of 64 bytes:
#
# ..  tag    |index|  line  |
# ..         |   row   |    |
# ..         |     |---|    | ROW_LINE_BITS  (3)
# ..         |     |--- - --| LINE_OFF_BITS  (6)
# ..         |         |- --| ROW_OFF_BITS   (3)
# ..         |----- ---|    | ROW_BITS       (8)
# ..         |-----|        | INDEX_BITS     (5)
# .. --------|              | TAG_BITS       (45)

TAG_RAM_WIDTH = TAG_WIDTH * NUM_WAYS

def CacheTagArray():
    return Array(Signal(TAG_RAM_WIDTH) for x in range(NUM_LINES))

def CacheValidBitsArray():
    # one valid bit per way, for each line
    return Array(Signal(NUM_WAYS) for x in range(NUM_LINES))

def RowPerLineValidArray():
    return Array(Signal() for x in range(ROW_PER_LINE))

# L1 TLB
TLB_SET_BITS = log2_int(TLB_SET_SIZE)
TLB_WAY_BITS = log2_int(TLB_NUM_WAYS)
TLB_EA_TAG_BITS = 64 - (TLB_LG_PGSZ + TLB_SET_BITS)
TLB_TAG_WAY_BITS = TLB_NUM_WAYS * TLB_EA_TAG_BITS
TLB_PTE_BITS = 64
TLB_PTE_WAY_BITS = TLB_NUM_WAYS * TLB_PTE_BITS
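
# With the defaults above the TLB geometry works out as TLB_SET_BITS = 6,
# TLB_WAY_BITS = 1 and TLB_EA_TAG_BITS = 64 - (12 + 6) = 46, so each TLB
# set packs 2 x 46 = 92 tag bits and 2 x 64 = 128 PTE bits per row.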

assert (LINE_SIZE % ROW_SIZE) == 0, "LINE_SIZE not multiple of ROW_SIZE"
assert (LINE_SIZE & (LINE_SIZE - 1)) == 0, "LINE_SIZE not power of 2"
assert (NUM_LINES & (NUM_LINES - 1)) == 0, "NUM_LINES not power of 2"
assert (ROW_PER_LINE & (ROW_PER_LINE - 1)) == 0, "ROW_PER_LINE not power of 2"
assert ROW_BITS == (INDEX_BITS + ROW_LINE_BITS), "geometry bits don't add up"
assert (LINE_OFF_BITS == ROW_OFF_BITS + ROW_LINE_BITS), \
        "geometry bits don't add up"
assert REAL_ADDR_BITS == (TAG_BITS + INDEX_BITS + LINE_OFF_BITS), \
        "geometry bits don't add up"
assert REAL_ADDR_BITS == (TAG_BITS + ROW_BITS + ROW_OFF_BITS), \
        "geometry bits don't add up"
assert 64 == WB_DATA_BITS, "Can't yet handle wb width that isn't 64-bits"
assert SET_SIZE_BITS <= TLB_LG_PGSZ, "Set indexed by virtual address"


def TLBValidBitsArray():
    return Array(Signal(TLB_NUM_WAYS) for x in range(TLB_SET_SIZE))

def TLBTagEAArray():
    return Array(Signal(TLB_EA_TAG_BITS) for x in range(TLB_NUM_WAYS))

def TLBTagsArray():
    return Array(Signal(TLB_TAG_WAY_BITS) for x in range(TLB_SET_SIZE))

def TLBPtesArray():
    return Array(Signal(TLB_PTE_WAY_BITS) for x in range(TLB_SET_SIZE))

def HitWaySet():
    return Array(Signal(NUM_WAYS) for x in range(TLB_NUM_WAYS))

# Cache RAM interface
def CacheRamOut():
    return Array(Signal(WB_DATA_BITS) for x in range(NUM_WAYS))

# PLRU output interface
def PLRUOut():
    return Array(Signal(WAY_BITS) for x in range(NUM_LINES))

# TLB PLRU output interface
def TLBPLRUOut():
    return Array(Signal(TLB_WAY_BITS) for x in range(TLB_SET_SIZE))

# Helper functions to decode incoming requests
#
# Return the cache line index (tag index) for an address
def get_index(addr):
    return addr[LINE_OFF_BITS:SET_SIZE_BITS]

# Return the cache row index (data memory) for an address
def get_row(addr):
    return addr[ROW_OFF_BITS:SET_SIZE_BITS]

# Return the index of a row within a line
def get_row_of_line(row):
    return row[:ROW_LINE_BITS]

# Returns whether this is the last row of a line
def is_last_row_addr(addr, last):
    return addr[ROW_OFF_BITS:LINE_OFF_BITS] == last

# Returns whether this is the last row of a line
def is_last_row(row, last):
    return get_row_of_line(row) == last

# Return the next row in the current cache line. We use a
# dedicated function in order to limit the size of the
# generated adder to be only the bits within a cache line
# (3 bits with default settings)
def next_row(row):
    row_v = row[0:ROW_LINE_BITS] + 1
    return Cat(row_v[:ROW_LINE_BITS], row[ROW_LINE_BITS:])
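
# Illustrative only: with ROW_LINE_BITS = 3, next_row(0b10_111) gives
# 0b10_000 -- the increment wraps within the line, and the upper (index)
# bits pass through unchanged, so only a 3-bit adder is generated.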

# Get the tag value from the address
def get_tag(addr):
    return addr[SET_SIZE_BITS:REAL_ADDR_BITS]

# Read a tag from a tag memory row
def read_tag(way, tagset):
    return tagset.word_select(way, TAG_WIDTH)[:TAG_BITS]

# Read a TLB tag from a TLB tag memory row
def read_tlb_tag(way, tags):
    return tags.word_select(way, TLB_EA_TAG_BITS)

# Write a TLB tag to a TLB tag memory row
def write_tlb_tag(way, tags, tag):
    return read_tlb_tag(way, tags).eq(tag)

# Read a PTE from a TLB PTE memory row
def read_tlb_pte(way, ptes):
    return ptes.word_select(way, TLB_PTE_BITS)

def write_tlb_pte(way, ptes, newpte):
    return read_tlb_pte(way, ptes).eq(newpte)
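
# These helpers return assignable slices: the read_* functions pick way
# "way" out of a packed per-set word via word_select(), and the write_*
# functions assign into that same slice, e.g. (as used in DTLBUpdate
# below):
#
#     comb += write_tlb_tag(way, tagset, tag)  # update one way in place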


# Record for storing permission, attribute, etc. bits from a PTE
class PermAttr(RecordObject):
    def __init__(self):
        super().__init__()
        self.reference = Signal()
        self.changed = Signal()
        self.nocache = Signal()
        self.priv = Signal()
        self.rd_perm = Signal()
        self.wr_perm = Signal()


def extract_perm_attr(pte):
    pa = PermAttr()
    pa.reference = pte[8]
    pa.changed = pte[7]
    pa.nocache = pte[5]
    pa.priv = pte[3]
    pa.rd_perm = pte[2]
    pa.wr_perm = pte[1]
    return pa


# Type of operation on a "valid" input
@unique
class Op(Enum):
    OP_NONE = 0
    OP_BAD = 1        # NC cache hit, TLB miss, prot/RC failure
    OP_STCX_FAIL = 2  # conditional store w/o reservation
    OP_LOAD_HIT = 3   # Cache hit on load
    OP_LOAD_MISS = 4  # Load missing cache
    OP_LOAD_NC = 5    # Non-cachable load
    OP_STORE_HIT = 6  # Store hitting cache
    OP_STORE_MISS = 7 # Store missing cache


# Cache state machine
@unique
class State(Enum):
    IDLE = 0             # Normal load hit processing
    RELOAD_WAIT_ACK = 1  # Cache reload wait ack
    STORE_WAIT_ACK = 2   # Store wait ack
    NC_LOAD_WAIT_ACK = 3 # Non-cachable load wait ack


# Dcache operations:
#
# In order to make timing, we use the BRAMs with
# an output buffer, which means that the BRAM
# output is delayed by an extra cycle.
#
# Thus, the dcache has a 2-stage internal pipeline
# for cache hits with no stalls.
#
# All other operations are handled via stalling
# in the first stage.
#
# The second stage can thus complete a hit at the same
# time as the first stage emits a stall for a complex op.
#
# Stage 0 register, basically contains just the latched request

class RegStage0(RecordObject):
    def __init__(self):
        super().__init__()
        self.req = LoadStore1ToDCacheType()
        self.tlbie = Signal()
        self.doall = Signal()
        self.tlbld = Signal()
        self.mmu_req = Signal() # indicates source of request


class MemAccessRequest(RecordObject):
    def __init__(self):
        super().__init__()
        self.op = Signal(Op)
        self.valid = Signal()
        self.dcbz = Signal()
        self.real_addr = Signal(REAL_ADDR_BITS)
        self.data = Signal(64)
        self.byte_sel = Signal(8)
        self.hit_way = Signal(WAY_BITS)
        self.same_tag = Signal()
        self.mmu_req = Signal()


# First stage register, contains state for stage 1 of load hits
# and for the state machine used by all other operations
class RegStage1(RecordObject):
    def __init__(self):
        super().__init__()
        # Info about the request
        self.full = Signal()    # have uncompleted request
        self.mmu_req = Signal() # request is from MMU
        self.req = MemAccessRequest()

        # Cache hit state
        self.hit_way = Signal(WAY_BITS)
        self.hit_load_valid = Signal()
        self.hit_index = Signal(INDEX_BITS)
        self.cache_hit = Signal()

        # TLB hit state
        self.tlb_hit = Signal()
        self.tlb_hit_way = Signal(TLB_WAY_BITS)     # way index
        self.tlb_hit_index = Signal(TLB_SET_BITS)   # which TLB set was hit

        # 2-stage data buffer for data forwarded from writes to reads
        self.forward_data1 = Signal(64)
        self.forward_data2 = Signal(64)
        self.forward_sel1 = Signal(8)
        self.forward_valid1 = Signal()
        self.forward_way1 = Signal(WAY_BITS)
        self.forward_row1 = Signal(ROW_BITS)
        self.use_forward1 = Signal()
        self.forward_sel = Signal(8)

        # Cache miss state (reload state machine)
        self.state = Signal(State)
        self.dcbz = Signal()
        self.write_bram = Signal()
        self.write_tag = Signal()
        self.slow_valid = Signal()
        self.wb = WBMasterOut()
        self.reload_tag = Signal(TAG_BITS)
        self.store_way = Signal(WAY_BITS)
        self.store_row = Signal(ROW_BITS)
        self.store_index = Signal(INDEX_BITS)
        self.end_row_ix = Signal(ROW_LINE_BITS)     # row index within a line
        self.rows_valid = RowPerLineValidArray()
        self.acks_pending = Signal(3)
        self.inc_acks = Signal()
        self.dec_acks = Signal()

        # Signals to complete (possibly with error)
        self.ls_valid = Signal()
        self.ls_error = Signal()
        self.mmu_done = Signal()
        self.mmu_error = Signal()
        self.cache_paradox = Signal()

        # Signal to complete a failed stcx.
        self.stcx_fail = Signal()


# Reservation information
class Reservation(RecordObject):
    def __init__(self):
        super().__init__()
        self.valid = Signal()
        self.addr = Signal(64-LINE_OFF_BITS)


class DTLBUpdate(Elaboratable):
    def __init__(self):
        self.tlbie = Signal()
        self.tlbwe = Signal()
        self.doall = Signal()
        self.updated = Signal()
        self.v_updated = Signal()
        self.tlb_hit = Signal()
        self.tlb_req_index = Signal(TLB_SET_BITS)

        self.tlb_hit_way = Signal(TLB_WAY_BITS)
        self.tlb_tag_way = Signal(TLB_TAG_WAY_BITS)
        self.tlb_pte_way = Signal(TLB_PTE_WAY_BITS)
        self.repl_way = Signal(TLB_WAY_BITS)
        self.eatag = Signal(TLB_EA_TAG_BITS)
        self.pte_data = Signal(TLB_PTE_BITS)

        self.dv = Signal(TLB_NUM_WAYS)       # valid bits in (one per way)

        self.tb_out = Signal(TLB_TAG_WAY_BITS)
        self.pb_out = Signal(TLB_PTE_WAY_BITS)
        self.db_out = Signal(TLB_NUM_WAYS)   # valid bits out (one per way)

    def elaborate(self, platform):
        m = Module()
        comb = m.d.comb
        sync = m.d.sync

        tagset = Signal(TLB_TAG_WAY_BITS)
        pteset = Signal(TLB_PTE_WAY_BITS)

        tb_out, pb_out, db_out = self.tb_out, self.pb_out, self.db_out

        with m.If(self.tlbie & self.doall):
            pass # clear all back in parent
        with m.Elif(self.tlbie):
            with m.If(self.tlb_hit):
                comb += db_out.eq(self.dv)
                # invalidate the hit way
                comb += db_out.bit_select(self.tlb_hit_way, 1).eq(0)
                comb += self.v_updated.eq(1)

        with m.Elif(self.tlbwe):

            comb += tagset.eq(self.tlb_tag_way)
            comb += write_tlb_tag(self.repl_way, tagset, self.eatag)
            comb += tb_out.eq(tagset)

            comb += pteset.eq(self.tlb_pte_way)
            comb += write_tlb_pte(self.repl_way, pteset, self.pte_data)
            comb += pb_out.eq(pteset)

            comb += db_out.bit_select(self.repl_way, 1).eq(1)

            comb += self.updated.eq(1)
            comb += self.v_updated.eq(1)

        return m

class DCachePendingHit(Elaboratable):

    def __init__(self, tlb_pte_way, tlb_valid_way, tlb_hit_way,
                 cache_valid_idx, cache_tag_set,
                 req_addr,
                 hit_set):

        self.go = Signal()
        self.virt_mode = Signal()
        self.is_hit = Signal()
        self.tlb_hit = Signal()
        self.hit_way = Signal(WAY_BITS)
        self.rel_match = Signal()
        self.req_index = Signal(INDEX_BITS)
        self.reload_tag = Signal(TAG_BITS)

        self.tlb_hit_way = tlb_hit_way
        self.tlb_pte_way = tlb_pte_way
        self.tlb_valid_way = tlb_valid_way
        self.cache_valid_idx = cache_valid_idx
        self.cache_tag_set = cache_tag_set
        self.req_addr = req_addr
        self.hit_set = hit_set

    def elaborate(self, platform):
        m = Module()
        comb = m.d.comb
        sync = m.d.sync

        go = self.go
        virt_mode = self.virt_mode
        is_hit = self.is_hit
        tlb_pte_way = self.tlb_pte_way
        tlb_valid_way = self.tlb_valid_way
        cache_valid_idx = self.cache_valid_idx
        cache_tag_set = self.cache_tag_set
        req_addr = self.req_addr
        tlb_hit_way = self.tlb_hit_way
        tlb_hit = self.tlb_hit
        hit_set = self.hit_set
        hit_way = self.hit_way
        rel_match = self.rel_match
        req_index = self.req_index
        reload_tag = self.reload_tag

        rel_matches = Array(Signal() for i in range(TLB_NUM_WAYS))
        hit_way_set = HitWaySet()

        # Test if pending request is a hit on any way
        # In order to make timing in virtual mode,
        # when we are using the TLB, we compare each
        # way with each of the real addresses from each way of
        # the TLB, and then decide later which match to use.

        with m.If(virt_mode):
            for j in range(TLB_NUM_WAYS):
                s_tag = Signal(TAG_BITS, name="s_tag%d" % j)
                s_hit = Signal()
                s_pte = Signal(TLB_PTE_BITS)
                s_ra = Signal(REAL_ADDR_BITS)
                comb += s_pte.eq(read_tlb_pte(j, tlb_pte_way))
                comb += s_ra.eq(Cat(req_addr[0:TLB_LG_PGSZ],
                                    s_pte[TLB_LG_PGSZ:REAL_ADDR_BITS]))
                comb += s_tag.eq(get_tag(s_ra))

                for i in range(NUM_WAYS):
                    is_tag_hit = Signal()
                    comb += is_tag_hit.eq(go & cache_valid_idx[i] &
                                  (read_tag(i, cache_tag_set) == s_tag)
                                  & tlb_valid_way[j])
                    with m.If(is_tag_hit):
                        comb += hit_way_set[j].eq(i)
                        comb += s_hit.eq(1)
                comb += hit_set[j].eq(s_hit)
                with m.If(s_tag == reload_tag):
                    comb += rel_matches[j].eq(1)
            with m.If(tlb_hit):
                comb += is_hit.eq(hit_set[tlb_hit_way])
                comb += hit_way.eq(hit_way_set[tlb_hit_way])
                comb += rel_match.eq(rel_matches[tlb_hit_way])
        with m.Else():
            s_tag = Signal(TAG_BITS)
            comb += s_tag.eq(get_tag(req_addr))
            for i in range(NUM_WAYS):
                is_tag_hit = Signal()
                comb += is_tag_hit.eq(go & cache_valid_idx[i] &
                                  (read_tag(i, cache_tag_set) == s_tag))
                with m.If(is_tag_hit):
                    comb += hit_way.eq(i)
                    comb += is_hit.eq(1)
            with m.If(s_tag == reload_tag):
                comb += rel_match.eq(1)

        return m


class DCache(Elaboratable):
    """Set associative dcache write-through
    TODO (in no specific order):
    * See list in icache.vhdl
    * Complete load misses on the cycle when WB data comes instead of
      at the end of line (this requires dealing with requests coming in
      while not idle...)
    """
    def __init__(self):
        self.d_in = LoadStore1ToDCacheType("d_in")
        self.d_out = DCacheToLoadStore1Type("d_out")

        self.m_in = MMUToDCacheType()
        self.m_out = DCacheToMMUType()

        self.stall_out = Signal()

        self.wb_out = WBMasterOut()
        self.wb_in = WBSlaveOut()

        self.log_out = Signal(20)

    def stage_0(self, m, r0, r1, r0_full):
        """Latch the request in r0.req as long as we're not stalling
        """
        comb = m.d.comb
        sync = m.d.sync
        d_in, d_out, m_in = self.d_in, self.d_out, self.m_in

        r = RegStage0()

        # TODO, this goes in unit tests and formal proofs
        # (a "collision" is loadstore1 and the MMU both presenting
        # a valid request in the same cycle)
        with m.If(d_in.valid & m_in.valid):
            #sync += Display("request collision loadstore vs MMU")
            pass

        with m.If(m_in.valid):
            sync += r.req.valid.eq(1)
            sync += r.req.load.eq(~(m_in.tlbie | m_in.tlbld))
            sync += r.req.dcbz.eq(0)
            sync += r.req.nc.eq(0)
            sync += r.req.reserve.eq(0)
            sync += r.req.virt_mode.eq(0) # MMU requests are in real mode
            sync += r.req.priv_mode.eq(1)
            sync += r.req.addr.eq(m_in.addr)
            sync += r.req.data.eq(m_in.pte)
            sync += r.req.byte_sel.eq(~0) # Const -1 sets all to 0b111....
            sync += r.tlbie.eq(m_in.tlbie)
            sync += r.doall.eq(m_in.doall)
            sync += r.tlbld.eq(m_in.tlbld)
            sync += r.mmu_req.eq(1)
        with m.Else():
            sync += r.req.eq(d_in)
            sync += r.tlbie.eq(0)
            sync += r.doall.eq(0)
            sync += r.tlbld.eq(0)
            sync += r.mmu_req.eq(0)
        with m.If(~(r1.full & r0_full)):
            sync += r0.eq(r)
            sync += r0_full.eq(r.req.valid)

    def tlb_read(self, m, r0_stall, tlb_valid_way,
                 tlb_tag_way, tlb_pte_way, dtlb_valid_bits,
                 dtlb_tags, dtlb_ptes):
        """TLB
        Operates in the second cycle on the request latched in r0.req.
        TLB updates write the entry at the end of the second cycle.
        """
        comb = m.d.comb
        sync = m.d.sync
        m_in, d_in = self.m_in, self.d_in

        index = Signal(TLB_SET_BITS)
        addrbits = Signal(TLB_SET_BITS)

        amin = TLB_LG_PGSZ
        amax = TLB_LG_PGSZ + TLB_SET_BITS

        with m.If(m_in.valid):
            comb += addrbits.eq(m_in.addr[amin : amax])
        with m.Else():
            comb += addrbits.eq(d_in.addr[amin : amax])
        comb += index.eq(addrbits)

        # If we have any op and the previous op isn't finished,
        # then keep the same output for next cycle.
        with m.If(~r0_stall):
            sync += tlb_valid_way.eq(dtlb_valid_bits[index])
            sync += tlb_tag_way.eq(dtlb_tags[index])
            sync += tlb_pte_way.eq(dtlb_ptes[index])

    def maybe_tlb_plrus(self, m, r1, tlb_plru_victim):
        """Generate TLB PLRUs
        """
        comb = m.d.comb
        sync = m.d.sync

        if TLB_NUM_WAYS == 0:
            return
        for i in range(TLB_SET_SIZE):
            # TLB PLRU interface
            tlb_plru = PLRU(TLB_WAY_BITS)
            setattr(m.submodules, "maybe_plru_%d" % i, tlb_plru)
            tlb_plru_acc_en = Signal()

            comb += tlb_plru_acc_en.eq(r1.tlb_hit & (r1.tlb_hit_index == i))
            comb += tlb_plru.acc_en.eq(tlb_plru_acc_en)
            comb += tlb_plru.acc.eq(r1.tlb_hit_way)
            comb += tlb_plru_victim[i].eq(tlb_plru.lru_o)

    def tlb_search(self, m, tlb_req_index, r0, r0_valid,
                   tlb_valid_way, tlb_tag_way, tlb_hit_way,
                   tlb_pte_way, pte, tlb_hit, valid_ra, perm_attr, ra):

        comb = m.d.comb
        sync = m.d.sync

        hitway = Signal(TLB_WAY_BITS)
        hit = Signal()
        eatag = Signal(TLB_EA_TAG_BITS)

        TLB_LG_END = TLB_LG_PGSZ + TLB_SET_BITS
        comb += tlb_req_index.eq(r0.req.addr[TLB_LG_PGSZ : TLB_LG_END])
        comb += eatag.eq(r0.req.addr[TLB_LG_END : 64 ])

        for i in range(TLB_NUM_WAYS):
            is_tag_hit = Signal()
            comb += is_tag_hit.eq(tlb_valid_way[i]
                                  & (read_tlb_tag(i, tlb_tag_way) == eatag))
            with m.If(is_tag_hit):
                comb += hitway.eq(i)
                comb += hit.eq(1)

        comb += tlb_hit.eq(hit & r0_valid)
        comb += tlb_hit_way.eq(hitway)

        with m.If(tlb_hit):
            comb += pte.eq(read_tlb_pte(hitway, tlb_pte_way))
        with m.Else():
            comb += pte.eq(0)
        comb += valid_ra.eq(tlb_hit | ~r0.req.virt_mode)
        with m.If(r0.req.virt_mode):
            comb += ra.eq(Cat(Const(0, ROW_OFF_BITS),
                              r0.req.addr[ROW_OFF_BITS:TLB_LG_PGSZ],
                              pte[TLB_LG_PGSZ:REAL_ADDR_BITS]))
            comb += perm_attr.eq(extract_perm_attr(pte))
        with m.Else():
            comb += ra.eq(Cat(Const(0, ROW_OFF_BITS),
                              r0.req.addr[ROW_OFF_BITS:REAL_ADDR_BITS]))

            comb += perm_attr.reference.eq(1)
            comb += perm_attr.changed.eq(1)
            comb += perm_attr.priv.eq(1)
            comb += perm_attr.nocache.eq(0)
            comb += perm_attr.rd_perm.eq(1)
            comb += perm_attr.wr_perm.eq(1)

    def tlb_update(self, m, r0_valid, r0, dtlb_valid_bits, tlb_req_index,
                   tlb_hit_way, tlb_hit, tlb_plru_victim, tlb_tag_way,
                   dtlb_tags, tlb_pte_way, dtlb_ptes):

        comb = m.d.comb
        sync = m.d.sync

        tlbie = Signal()
        tlbwe = Signal()

        comb += tlbie.eq(r0_valid & r0.tlbie)
        comb += tlbwe.eq(r0_valid & r0.tlbld)

        m.submodules.tlb_update = d = DTLBUpdate()
        with m.If(tlbie & r0.doall):
            # clear all valid bits at once
            for i in range(TLB_SET_SIZE):
                sync += dtlb_valid_bits[i].eq(0)
        with m.If(d.updated):
            sync += dtlb_tags[tlb_req_index].eq(d.tb_out)
            sync += dtlb_ptes[tlb_req_index].eq(d.pb_out)
        with m.If(d.v_updated):
            sync += dtlb_valid_bits[tlb_req_index].eq(d.db_out)

        comb += d.dv.eq(dtlb_valid_bits[tlb_req_index])

        comb += d.tlbie.eq(tlbie)
        comb += d.tlbwe.eq(tlbwe)
        comb += d.doall.eq(r0.doall)
        comb += d.tlb_hit.eq(tlb_hit)
        comb += d.tlb_hit_way.eq(tlb_hit_way)
        comb += d.tlb_tag_way.eq(tlb_tag_way)
        comb += d.tlb_pte_way.eq(tlb_pte_way)
        comb += d.tlb_req_index.eq(tlb_req_index)

        with m.If(tlb_hit):
            comb += d.repl_way.eq(tlb_hit_way)
        with m.Else():
            comb += d.repl_way.eq(tlb_plru_victim[tlb_req_index])
        comb += d.eatag.eq(r0.req.addr[TLB_LG_PGSZ + TLB_SET_BITS:64])
        comb += d.pte_data.eq(r0.req.data)

    def maybe_plrus(self, m, r1, plru_victim):
        """Generate PLRUs
        """
        comb = m.d.comb
        sync = m.d.sync

        if NUM_WAYS == 0:
            return

        for i in range(NUM_LINES):
            # PLRU interface
            plru = PLRU(WAY_BITS)
            setattr(m.submodules, "plru%d" % i, plru)
            plru_acc_en = Signal()

            comb += plru_acc_en.eq(r1.cache_hit & (r1.hit_index == i))
            comb += plru.acc_en.eq(plru_acc_en)
            comb += plru.acc.eq(r1.hit_way)
            comb += plru_victim[i].eq(plru.lru_o)

    def cache_tag_read(self, m, r0_stall, req_index, cache_tag_set,
                       cache_tags):
        """Cache tag RAM read port
        """
        comb = m.d.comb
        sync = m.d.sync
        m_in, d_in = self.m_in, self.d_in

        index = Signal(INDEX_BITS)

        with m.If(r0_stall):
            comb += index.eq(req_index)
        with m.Elif(m_in.valid):
            comb += index.eq(get_index(m_in.addr))
        with m.Else():
            comb += index.eq(get_index(d_in.addr))
        sync += cache_tag_set.eq(cache_tags[index])

    def dcache_request(self, m, r0, ra, req_index, req_row, req_tag,
                       r0_valid, r1, cache_valid_bits, replace_way,
                       use_forward1_next, use_forward2_next,
                       req_hit_way, plru_victim, rc_ok, perm_attr,
                       valid_ra, perm_ok, access_ok, req_op, req_go,
                       tlb_pte_way,
                       tlb_hit, tlb_hit_way, tlb_valid_way, cache_tag_set,
                       cancel_store, req_same_tag, r0_stall, early_req_row):
        """Cache request parsing and hit detection
        """

        comb = m.d.comb
        sync = m.d.sync
        m_in, d_in = self.m_in, self.d_in

        is_hit = Signal()
        hit_way = Signal(WAY_BITS)
        op = Signal(Op)
        opsel = Signal(3)
        go = Signal()
        nc = Signal()
        hit_set = Array(Signal() for i in range(TLB_NUM_WAYS))
        cache_valid_idx = Signal(NUM_WAYS) # one valid bit per way

        # Extract line, row and tag from request
        comb += req_index.eq(get_index(r0.req.addr))
        comb += req_row.eq(get_row(r0.req.addr))
        comb += req_tag.eq(get_tag(ra))

        comb += go.eq(r0_valid & ~(r0.tlbie | r0.tlbld) & ~r1.ls_error)
        comb += cache_valid_idx.eq(cache_valid_bits[req_index])

        m.submodules.dcache_pend = dc = DCachePendingHit(tlb_pte_way,
                                            tlb_valid_way, tlb_hit_way,
                                            cache_valid_idx, cache_tag_set,
                                            r0.req.addr,
                                            hit_set)

        comb += dc.tlb_hit.eq(tlb_hit)
        comb += dc.reload_tag.eq(r1.reload_tag)
        comb += dc.virt_mode.eq(r0.req.virt_mode)
        comb += dc.go.eq(go)
        comb += dc.req_index.eq(req_index)
        comb += is_hit.eq(dc.is_hit)
        comb += hit_way.eq(dc.hit_way)
        comb += req_same_tag.eq(dc.rel_match)

        # See if the request matches the line currently being reloaded
        with m.If((r1.state == State.RELOAD_WAIT_ACK) &
                  (req_index == r1.store_index) & req_same_tag):
            # For a store, consider this a hit even if the row isn't
            # valid since it will be by the time we perform the store.
            # For a load, check the appropriate row valid bit.
            valid = r1.rows_valid[req_row % ROW_PER_LINE]
            comb += is_hit.eq(~r0.req.load | valid)
            comb += hit_way.eq(replace_way)

        # Whether to use forwarded data for a load or not
        comb += use_forward1_next.eq(0)
        with m.If((get_row(r1.req.real_addr) == req_row) &
                  (r1.req.hit_way == hit_way)):
            # Only need to consider r1.write_bram here, since if we
            # are writing refill data here, then we don't have a
            # cache hit this cycle on the line being refilled.
            # (There is the possibility that the load following the
            # load miss that started the refill could be to the old
            # contents of the victim line, since it is a couple of
            # cycles after the refill starts before we see the updated
            # cache tag. In that case we don't use the bypass.)
            comb += use_forward1_next.eq(r1.write_bram)
        comb += use_forward2_next.eq(0)
        with m.If((r1.forward_row1 == req_row) &
                  (r1.forward_way1 == hit_way)):
            comb += use_forward2_next.eq(r1.forward_valid1)

        # The way that matched on a hit
        comb += req_hit_way.eq(hit_way)

        # The way to replace on a miss
        with m.If(r1.write_tag):
            comb += replace_way.eq(plru_victim[r1.store_index])
        with m.Else():
            comb += replace_way.eq(r1.store_way)

        # work out whether we have permission for this access
        # NB we don't yet implement AMR, thus no KUAP
        comb += rc_ok.eq(perm_attr.reference
                         & (r0.req.load | perm_attr.changed)
                         )
        comb += perm_ok.eq((r0.req.priv_mode | ~perm_attr.priv)
                           & (perm_attr.wr_perm
                              | (r0.req.load & perm_attr.rd_perm))
                           )
        comb += access_ok.eq(valid_ra & perm_ok & rc_ok)
        # Combine the request and cache hit status to decide what
        # operation needs to be done
        comb += nc.eq(r0.req.nc | perm_attr.nocache)
        comb += op.eq(Op.OP_NONE)
        with m.If(go):
            with m.If(~access_ok):
                comb += op.eq(Op.OP_BAD)
            with m.Elif(cancel_store):
                comb += op.eq(Op.OP_STCX_FAIL)
            with m.Else():
                comb += opsel.eq(Cat(is_hit, nc, r0.req.load))
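                # opsel bit order (LSB first, from Cat): bit 0 = is_hit,
                # bit 1 = nc, bit 2 = load.  e.g. 0b101 is a cacheable
                # load that hit, and 0b011 is a store to a non-cachable
                # address that hit (a paradox, hence OP_BAD).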
                with m.Switch(opsel):
                    with m.Case(0b101):
                        comb += op.eq(Op.OP_LOAD_HIT)
                    with m.Case(0b100):
                        comb += op.eq(Op.OP_LOAD_MISS)
                    with m.Case(0b110):
                        comb += op.eq(Op.OP_LOAD_NC)
                    with m.Case(0b001):
                        comb += op.eq(Op.OP_STORE_HIT)
                    with m.Case(0b000):
                        comb += op.eq(Op.OP_STORE_MISS)
                    with m.Case(0b010):
                        comb += op.eq(Op.OP_STORE_MISS)
                    with m.Case(0b011):
                        comb += op.eq(Op.OP_BAD)
                    with m.Case(0b111):
                        comb += op.eq(Op.OP_BAD)
                    with m.Default():
                        comb += op.eq(Op.OP_NONE)
        comb += req_op.eq(op)
        comb += req_go.eq(go)

        # Version of the row number that is valid one cycle earlier
        # in the cases where we need to read the cache data BRAM.
        # If we're stalling then we need to keep reading the last
        # row requested.
        with m.If(~r0_stall):
            with m.If(m_in.valid):
                comb += early_req_row.eq(get_row(m_in.addr))
            with m.Else():
                comb += early_req_row.eq(get_row(d_in.addr))
        with m.Else():
            comb += early_req_row.eq(req_row)

    def reservation_comb(self, m, cancel_store, set_rsrv, clear_rsrv,
                         r0_valid, r0, reservation):
        """Handle load-with-reservation and store-conditional instructions
        """
        comb = m.d.comb
        sync = m.d.sync

        with m.If(r0_valid & r0.req.reserve):

            # XXX generate alignment interrupt if address
            # is not aligned XXX or if r0.req.nc = '1'
            with m.If(r0.req.load):
                comb += set_rsrv.eq(1) # load with reservation
            with m.Else():
                comb += clear_rsrv.eq(1) # store conditional
                # cancel the store if the reservation is not valid
                # or the address does not match the reserved line
                with m.If(~reservation.valid |
                          (r0.req.addr[LINE_OFF_BITS:64] !=
                           reservation.addr)):
                    comb += cancel_store.eq(1)

    def reservation_reg(self, m, r0_valid, access_ok, set_rsrv, clear_rsrv,
                        reservation, r0):

        comb = m.d.comb
        sync = m.d.sync

        with m.If(r0_valid & access_ok):
            with m.If(clear_rsrv):
                sync += reservation.valid.eq(0)
            with m.Elif(set_rsrv):
                sync += reservation.valid.eq(1)
                sync += reservation.addr.eq(r0.req.addr[LINE_OFF_BITS:64])

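    # Note the reservation granularity: only address bits above
    # LINE_OFF_BITS are stored, so e.g. a lwarx to 0x1008 reserves the
    # whole 64-byte line at 0x1000, and a later stwcx. succeeds only if
    # it targets that same line.
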
    def writeback_control(self, m, r1, cache_out):
        """Return data for loads & completion control logic
        """
        comb = m.d.comb
        sync = m.d.sync
        d_out, m_out = self.d_out, self.m_out

        data_out = Signal(64)
        data_fwd = Signal(64)

        # Use the bypass if we are reading the row that was
        # written 1 or 2 cycles ago, including for the
        # slow_valid = 1 case (i.e. completing a load
        # miss or a non-cacheable load).
        with m.If(r1.use_forward1):
            comb += data_fwd.eq(r1.forward_data1)
        with m.Else():
            comb += data_fwd.eq(r1.forward_data2)

        comb += data_out.eq(cache_out[r1.hit_way])

        for i in range(8):
            with m.If(r1.forward_sel[i]):
                dsel = data_fwd.word_select(i, 8)
                comb += data_out.word_select(i, 8).eq(dsel)

        comb += d_out.valid.eq(r1.ls_valid)
        comb += d_out.data.eq(data_out)
        comb += d_out.store_done.eq(~r1.stcx_fail)
        comb += d_out.error.eq(r1.ls_error)
        comb += d_out.cache_paradox.eq(r1.cache_paradox)

        # Outputs to MMU
        comb += m_out.done.eq(r1.mmu_done)
        comb += m_out.err.eq(r1.mmu_error)
        comb += m_out.data.eq(data_out)

        # We have a valid load or store hit or we just completed
        # a slow op such as a load miss, a NC load or a store
        #
        # Note: the load hit is delayed by one cycle. However it
        # can still not collide with r.slow_valid (well unless I
        # miscalculated) because slow_valid can only be set on a
        # subsequent request and not on its first cycle (the state
        # machine must have advanced), which makes slow_valid
        # at least 2 cycles from the previous hit_load_valid.

        # Sanity: Only one of these must be set in any given cycle

        if False: # TODO: need Display to get this to work
            assert (r1.slow_valid & r1.stcx_fail) != 1, \
                "unexpected slow_valid collision with stcx_fail"

            assert ((r1.slow_valid | r1.stcx_fail) | r1.hit_load_valid) != 1, \
                "unexpected hit_load_delayed collision with slow_valid"

        with m.If(~r1.mmu_req):
            # Request came from loadstore1...
            # Load hit case is the standard path
            with m.If(r1.hit_load_valid):
                #Display(f"completing load hit data={data_out}")
                pass

            # error cases complete without stalling
            with m.If(r1.ls_error):
                #Display("completing ld/st with error")
                pass

            # Slow ops (load miss, NC, stores)
            with m.If(r1.slow_valid):
                #Display(f"completing store or load miss data={data_out}")
                pass

        with m.Else():
            # Request came from MMU
            with m.If(r1.hit_load_valid):
                #Display(f"completing load hit to MMU, data={m_out.data}")
                pass
            # error cases complete without stalling
            with m.If(r1.mmu_error):
                #Display("completing MMU ld with error")
                pass

            # Slow ops (i.e. load miss)
            with m.If(r1.slow_valid):
                #Display("completing MMU load miss, data={m_out.data}")
                pass

    def rams(self, m, r1, early_req_row, cache_out, replace_way):
        """rams
        Generate a cache RAM for each way. This handles the normal
        reads, writes from reloads and the special store-hit update
        path as well.

        Note: the BRAMs have an extra read buffer, meaning the output
        is pipelined an extra cycle. This differs from the
        icache. The writeback logic needs to take that into
        account by using 1-cycle delayed signals for load hits.
        """
        comb = m.d.comb
        wb_in = self.wb_in

        for i in range(NUM_WAYS):
            do_read = Signal()
            rd_addr = Signal(ROW_BITS)
            do_write = Signal()
            wr_addr = Signal(ROW_BITS)
            wr_data = Signal(WB_DATA_BITS)
            wr_sel = Signal(ROW_SIZE)
            wr_sel_m = Signal(ROW_SIZE)
            _d_out = Signal(WB_DATA_BITS)

            way = CacheRam(ROW_BITS, WB_DATA_BITS, True)
            setattr(m.submodules, "cacheram_%d" % i, way)

            comb += way.rd_en.eq(do_read)
            comb += way.rd_addr.eq(rd_addr)
            comb += _d_out.eq(way.rd_data_o)
            comb += way.wr_sel.eq(wr_sel_m)
            comb += way.wr_addr.eq(wr_addr)
            comb += way.wr_data.eq(wr_data)

            # Cache hit reads
            comb += do_read.eq(1)
            comb += rd_addr.eq(early_req_row)
            comb += cache_out[i].eq(_d_out)

            # Write mux:
            #
            # Defaults to wishbone read responses (cache refill)
            #
            # For timing, the mux on wr_data/sel/addr is not
            # dependent on anything other than the current state.

            with m.If(r1.write_bram):
                # Write store data to BRAM. This happens one
                # cycle after the store is in r0.
                comb += wr_data.eq(r1.req.data)
                comb += wr_sel.eq(r1.req.byte_sel)
                comb += wr_addr.eq(get_row(r1.req.real_addr))

                with m.If(i == r1.req.hit_way):
                    comb += do_write.eq(1)
            with m.Else():
                # Otherwise, we might be doing a reload or a DCBZ
                with m.If(r1.dcbz):
                    comb += wr_data.eq(0)
                with m.Else():
                    comb += wr_data.eq(wb_in.dat)
                comb += wr_addr.eq(r1.store_row)
                comb += wr_sel.eq(~0) # all 1s

                with m.If((r1.state == State.RELOAD_WAIT_ACK)
                          & wb_in.ack & (replace_way == i)):
                    comb += do_write.eq(1)

            # Mask write selects with do_write since BRAM
            # doesn't have a global write-enable
            with m.If(do_write):
                comb += wr_sel_m.eq(wr_sel)

    # Cache hit synchronous machine for the easy case.
    # This handles load hits.
    # It also handles error cases (TLB miss, cache paradox)
    def dcache_fast_hit(self, m, req_op, r0_valid, r0, r1,
                        req_hit_way, req_index, access_ok,
                        tlb_hit, tlb_hit_way, tlb_req_index):

        comb = m.d.comb
        sync = m.d.sync

        with m.If(req_op != Op.OP_NONE):
            #Display(f"op:{req_op} addr:{r0.req.addr} nc: {r0.req.nc}" \
            #        f"idx:{req_index} tag:{req_tag} way: {req_hit_way}"
            #       )
            pass

        with m.If(r0_valid):
            sync += r1.mmu_req.eq(r0.mmu_req)

        # Fast path for load/store hits.
        # Set signals for the writeback controls.
        sync += r1.hit_way.eq(req_hit_way)
        sync += r1.hit_index.eq(req_index)

        with m.If(req_op == Op.OP_LOAD_HIT):
            sync += r1.hit_load_valid.eq(1)
        with m.Else():
            sync += r1.hit_load_valid.eq(0)

        with m.If((req_op == Op.OP_LOAD_HIT) | (req_op == Op.OP_STORE_HIT)):
            sync += r1.cache_hit.eq(1)
        with m.Else():
            sync += r1.cache_hit.eq(0)

        with m.If(req_op == Op.OP_BAD):
            #Display(f"Signalling ld/st error valid_ra={valid_ra}"
            #        f"rc_ok={rc_ok} perm_ok={perm_ok}")
            sync += r1.ls_error.eq(~r0.mmu_req)
            sync += r1.mmu_error.eq(r0.mmu_req)
            sync += r1.cache_paradox.eq(access_ok)

        with m.Else():
            sync += r1.ls_error.eq(0)
            sync += r1.mmu_error.eq(0)
            sync += r1.cache_paradox.eq(0)

        with m.If(req_op == Op.OP_STCX_FAIL):
            sync += r1.stcx_fail.eq(1)
        with m.Else():
            sync += r1.stcx_fail.eq(0)

        # Record TLB hit information for updating TLB PLRU
        sync += r1.tlb_hit.eq(tlb_hit)
        sync += r1.tlb_hit_way.eq(tlb_hit_way)
        sync += r1.tlb_hit_index.eq(tlb_req_index)

    # Memory accesses are handled by this state machine:
    #
    #   * Cache load miss/reload (in conjunction with "rams")
    #   * Load hits for non-cachable forms
    #   * Stores (the collision case is handled in "rams")
    #
    # All wishbone requests generation is done here.
    # This machine operates at stage 1.
    def dcache_slow(self, m, r1, use_forward1_next, use_forward2_next,
                    cache_valid_bits, r0, replace_way,
                    req_hit_way, req_same_tag,
                    r0_valid, req_op, cache_tag, req_go, ra):

        comb = m.d.comb
        sync = m.d.sync
        wb_in = self.wb_in

        req = MemAccessRequest()
        acks = Signal(3)
        adjust_acks = Signal(3)
        stbs_done = Signal()

        sync += r1.use_forward1.eq(use_forward1_next)
        sync += r1.forward_sel.eq(0)

        with m.If(use_forward1_next):
            sync += r1.forward_sel.eq(r1.req.byte_sel)
        with m.Elif(use_forward2_next):
            sync += r1.forward_sel.eq(r1.forward_sel1)

        sync += r1.forward_data2.eq(r1.forward_data1)
        with m.If(r1.write_bram):
            sync += r1.forward_data1.eq(r1.req.data)
            sync += r1.forward_sel1.eq(r1.req.byte_sel)
            sync += r1.forward_way1.eq(r1.req.hit_way)
            sync += r1.forward_row1.eq(get_row(r1.req.real_addr))
            sync += r1.forward_valid1.eq(1)
        with m.Else():
            with m.If(r1.dcbz):
                sync += r1.forward_data1.eq(0)
            with m.Else():
                sync += r1.forward_data1.eq(wb_in.dat)
            sync += r1.forward_sel1.eq(~0) # all 1s
            sync += r1.forward_way1.eq(replace_way)
            sync += r1.forward_row1.eq(r1.store_row)
            sync += r1.forward_valid1.eq(0)

        # One cycle pulses reset
        sync += r1.slow_valid.eq(0)
        sync += r1.write_bram.eq(0)
        sync += r1.inc_acks.eq(0)
        sync += r1.dec_acks.eq(0)

        sync += r1.ls_valid.eq(0)
        # complete tlbies and TLB loads in the third cycle
        sync += r1.mmu_done.eq(r0_valid & (r0.tlbie | r0.tlbld))

        with m.If((req_op == Op.OP_LOAD_HIT)
                  | (req_op == Op.OP_STCX_FAIL)):
            with m.If(~r0.mmu_req):
                sync += r1.ls_valid.eq(1)
            with m.Else():
                sync += r1.mmu_done.eq(1)

        with m.If(r1.write_tag):
            # Store new tag in selected way
            for i in range(NUM_WAYS):
                with m.If(i == replace_way):
                    ct = Signal(TAG_RAM_WIDTH)
                    comb += ct.eq(cache_tag[r1.store_index])
                    comb += ct.word_select(i, TAG_WIDTH).eq(r1.reload_tag)
                    sync += cache_tag[r1.store_index].eq(ct)
            sync += r1.store_way.eq(replace_way)
            sync += r1.write_tag.eq(0)

        # Take request from r1.req if there is one there,
        # else from req_op, ra, etc.
        with m.If(r1.full):
            comb += req.eq(r1.req)
        with m.Else():
            comb += req.op.eq(req_op)
            comb += req.valid.eq(req_go)
            comb += req.mmu_req.eq(r0.mmu_req)
            comb += req.dcbz.eq(r0.req.dcbz)
            comb += req.real_addr.eq(ra)

            with m.If(~r0.req.dcbz):
                comb += req.data.eq(r0.req.data)
            with m.Else():
                comb += req.data.eq(0)

            # Select all bytes for dcbz
            # and for cacheable loads
            with m.If(r0.req.dcbz | (r0.req.load & ~r0.req.nc)):
                comb += req.byte_sel.eq(~0) # all 1s
            with m.Else():
                comb += req.byte_sel.eq(r0.req.byte_sel)
            comb += req.hit_way.eq(req_hit_way)
            comb += req.same_tag.eq(req_same_tag)

        # Store the incoming request from r0,
        # if it is a slow request
        # Note that r1.full = 1 implies req_op = OP_NONE
        with m.If((req_op == Op.OP_LOAD_MISS)
                  | (req_op == Op.OP_LOAD_NC)
                  | (req_op == Op.OP_STORE_MISS)
                  | (req_op == Op.OP_STORE_HIT)):
            sync += r1.req.eq(req)
            sync += r1.full.eq(1)

        # Main state machine
        with m.Switch(r1.state):

            with m.Case(State.IDLE):
                # XXX check 'left downto. probably means len(r1.wb.adr)
                #     r1.wb.adr <= req.real_addr(
                #                   r1.wb.adr'left downto 0
                #                  );
                sync += r1.wb.adr.eq(req.real_addr)
                sync += r1.wb.sel.eq(req.byte_sel)
                sync += r1.wb.dat.eq(req.data)
                sync += r1.dcbz.eq(req.dcbz)

                # Keep track of our index and way
                # for subsequent stores.
                sync += r1.store_index.eq(get_index(req.real_addr))
                sync += r1.store_row.eq(get_row(req.real_addr))
                # the reload wraps around the line, so it ends on the
                # row *before* the starting (critical) row
                sync += r1.end_row_ix.eq(
                         get_row_of_line(get_row(req.real_addr)) - 1
                        )
                sync += r1.reload_tag.eq(get_tag(req.real_addr))
                sync += r1.req.same_tag.eq(1)

                with m.If(req.op == Op.OP_STORE_HIT):
                    sync += r1.store_way.eq(req.hit_way)

                # Reset per-row valid bits,
                # ready for handling OP_LOAD_MISS
                for i in range(ROW_PER_LINE):
                    sync += r1.rows_valid[i].eq(0)

                with m.Switch(req.op):
                    with m.Case(Op.OP_LOAD_HIT):
                        # stay in IDLE state
                        pass

                    with m.Case(Op.OP_LOAD_MISS):
                        #Display(f"cache miss real addr:" \
                        #        f"{req_real_addr}" \
                        #        f" idx:{get_index(req_real_addr)}" \
                        #        f" tag:{get_tag(req.real_addr)}")
                        pass

                        # Start the wishbone cycle
                        sync += r1.wb.we.eq(0)
                        sync += r1.wb.cyc.eq(1)
                        sync += r1.wb.stb.eq(1)

                        # Track that we had one request sent
                        sync += r1.state.eq(State.RELOAD_WAIT_ACK)
                        sync += r1.write_tag.eq(1)

                    with m.Case(Op.OP_LOAD_NC):
                        sync += r1.wb.cyc.eq(1)
                        sync += r1.wb.stb.eq(1)
                        sync += r1.wb.we.eq(0)
                        sync += r1.state.eq(State.NC_LOAD_WAIT_ACK)

                    with m.Case(Op.OP_STORE_HIT, Op.OP_STORE_MISS):
                        with m.If(~req.dcbz):
                            sync += r1.state.eq(State.STORE_WAIT_ACK)
                            sync += r1.acks_pending.eq(1)
                            sync += r1.full.eq(0)
                            sync += r1.slow_valid.eq(1)

                            with m.If(~req.mmu_req):
                                sync += r1.ls_valid.eq(1)
                            with m.Else():
                                sync += r1.mmu_done.eq(1)

                            with m.If(req.op == Op.OP_STORE_HIT):
                                sync += r1.write_bram.eq(1)
                        with m.Else():
                            sync += r1.state.eq(State.RELOAD_WAIT_ACK)

                            with m.If(req.op == Op.OP_STORE_MISS):
                                sync += r1.write_tag.eq(1)

                        sync += r1.wb.we.eq(1)
                        sync += r1.wb.cyc.eq(1)
                        sync += r1.wb.stb.eq(1)

                    # OP_NONE and OP_BAD do nothing
                    # OP_BAD & OP_STCX_FAIL were
                    # handled above already
                    with m.Case(Op.OP_NONE):
                        pass
                    with m.Case(Op.OP_BAD):
                        pass
                    with m.Case(Op.OP_STCX_FAIL):
                        pass

            with m.Case(State.RELOAD_WAIT_ACK):
                # Requests are all sent if stb is 0
                comb += stbs_done.eq(~r1.wb.stb)

                with m.If(~wb_in.stall & ~stbs_done):
                    # That was the last word?
                    # We are done sending.
                    # Clear stb and set stbs_done
                    # so we can handle an eventual
                    # last ack on the same cycle.
                    with m.If(is_last_row_addr(
                              r1.wb.adr, r1.end_row_ix)):
                        sync += r1.wb.stb.eq(0)
                        comb += stbs_done.eq(0)

                    # Calculate the next row address
                    # in the current cache line
                    rarange = r1.wb.adr[ROW_OFF_BITS : LINE_OFF_BITS]
                    sync += rarange.eq(rarange + 1)

                # Incoming acks processing
                sync += r1.forward_valid1.eq(wb_in.ack)
                with m.If(wb_in.ack):
                    # XXX needs an Array bit-accessor here
                    sync += r1.rows_valid[r1.store_row % ROW_PER_LINE].eq(1)

                    # If this is the data we were looking for,
                    # we can complete the request next cycle.
                    # Compare the whole address in case the
                    # request in r1.req is not the one that
                    # started this refill.
                    with m.If(r1.full & r1.req.same_tag &
                              ((r1.dcbz & r1.req.dcbz) |
                               (~r1.dcbz &
                                (r1.req.op == Op.OP_LOAD_MISS))) &
                              (r1.store_row == get_row(r1.req.real_addr))):
                        sync += r1.full.eq(0)
                        sync += r1.slow_valid.eq(1)
                        with m.If(~r1.mmu_req):
                            sync += r1.ls_valid.eq(1)
                        with m.Else():
                            sync += r1.mmu_done.eq(1)
                        sync += r1.forward_sel.eq(~0) # all 1s
                        sync += r1.use_forward1.eq(1)

                    # Check for completion
                    with m.If(stbs_done & is_last_row(r1.store_row,
                                                      r1.end_row_ix)):
                        # Complete wishbone cycle
                        sync += r1.wb.cyc.eq(0)

                        # Cache line is now valid
                        cv = Signal(NUM_WAYS)
                        comb += cv.eq(cache_valid_bits[r1.store_index])
                        comb += cv.bit_select(r1.store_way, 1).eq(1)
                        sync += cache_valid_bits[r1.store_index].eq(cv)
                        sync += r1.state.eq(State.IDLE)

                    # Increment store row counter
                    sync += r1.store_row.eq(next_row(r1.store_row))

            with m.Case(State.STORE_WAIT_ACK):
                comb += stbs_done.eq(~r1.wb.stb)
                comb += acks.eq(r1.acks_pending)

                with m.If(r1.inc_acks != r1.dec_acks):
                    with m.If(r1.inc_acks):
                        comb += adjust_acks.eq(acks + 1)
                    with m.Else():
                        comb += adjust_acks.eq(acks - 1)
                with m.Else():
                    comb += adjust_acks.eq(acks)

                sync += r1.acks_pending.eq(adjust_acks)

                # Clear stb when slave accepted request
                with m.If(~wb_in.stall):
                    # See if there is another store waiting
                    # to be done which is in the same real page.
                    with m.If(req.valid):
                        ra = req.real_addr[0:SET_SIZE_BITS]
                        sync += r1.wb.adr[0:SET_SIZE_BITS].eq(ra)
                        sync += r1.wb.dat.eq(req.data)
                        sync += r1.wb.sel.eq(req.byte_sel)

                    with m.Elif((adjust_acks < 7) & req.same_tag &
                                ((req.op == Op.OP_STORE_MISS)
                                 | (req.op == Op.OP_STORE_HIT))):
                        sync += r1.wb.stb.eq(1)
                        comb += stbs_done.eq(0)

                        with m.If(req.op == Op.OP_STORE_HIT):
                            sync += r1.write_bram.eq(1)
                        sync += r1.full.eq(0)
                        sync += r1.slow_valid.eq(1)

                        # Store requests never come from the MMU
                        sync += r1.ls_valid.eq(1)
                        comb += stbs_done.eq(0)
                        sync += r1.inc_acks.eq(1)
                    with m.Else():
                        sync += r1.wb.stb.eq(0)
                        comb += stbs_done.eq(1)

                # Got ack ? See if complete.
                with m.If(wb_in.ack):
                    with m.If(stbs_done & (adjust_acks == 1)):
                        sync += r1.state.eq(State.IDLE)
                        sync += r1.wb.cyc.eq(0)
                        sync += r1.wb.stb.eq(0)
                    sync += r1.dec_acks.eq(1)

            with m.Case(State.NC_LOAD_WAIT_ACK):
                # Clear stb when slave accepted request
                with m.If(~wb_in.stall):
                    sync += r1.wb.stb.eq(0)

                # Got ack ? complete.
                with m.If(wb_in.ack):
                    sync += r1.state.eq(State.IDLE)
                    sync += r1.full.eq(0)
                    sync += r1.slow_valid.eq(1)

                    with m.If(~r1.mmu_req):
                        sync += r1.ls_valid.eq(1)
                    with m.Else():
                        sync += r1.mmu_done.eq(1)

                    sync += r1.forward_sel.eq(~0) # all 1s
                    sync += r1.use_forward1.eq(1)
                    sync += r1.wb.cyc.eq(0)
                    sync += r1.wb.stb.eq(0)

    def dcache_log(self, m, r1, valid_ra, tlb_hit_way, stall_out, req_op):

        sync = m.d.sync
        d_out, wb_in, log_out = self.d_out, self.wb_in, self.log_out

        sync += log_out.eq(Cat(r1.state[:3], valid_ra, tlb_hit_way[:3],
                               stall_out, req_op[:3], d_out.valid, d_out.error,
                               r1.wb.cyc, r1.wb.stb, wb_in.ack, wb_in.stall,
                               r1.wb.adr[3:6]))

    def elaborate(self, platform):

        m = Module()
        comb = m.d.comb

        # Storage. Hopefully "cache_rows" is a BRAM, the rest is LUTs
        cache_tags = CacheTagArray()
        cache_tag_set = Signal(TAG_RAM_WIDTH)
        cache_valid_bits = CacheValidBitsArray()

        # TODO attribute ram_style : string;
        # TODO attribute ram_style of cache_tags : signal is "distributed";

        """note: these are passed to nmigen.hdl.Memory as "attributes".
           don't know how, just that they are.
        """
        dtlb_valid_bits = TLBValidBitsArray()
        dtlb_tags = TLBTagsArray()
        dtlb_ptes = TLBPtesArray()
        # TODO attribute ram_style of
        #  dtlb_tags : signal is "distributed";
        # TODO attribute ram_style of
        #  dtlb_ptes : signal is "distributed";

        r0 = RegStage0()
        r0_full = Signal()

        r1 = RegStage1()

        reservation = Reservation()

        # Async signals on incoming request
        req_index = Signal(INDEX_BITS)
        req_row = Signal(ROW_BITS)
        req_hit_way = Signal(WAY_BITS)
        req_tag = Signal(TAG_BITS)
        req_op = Signal(Op)
        req_data = Signal(64)
        req_same_tag = Signal()
        req_go = Signal()

        early_req_row = Signal(ROW_BITS)

        cancel_store = Signal()
        set_rsrv = Signal()
        clear_rsrv = Signal()

        r0_valid = Signal()
        r0_stall = Signal()

        use_forward1_next = Signal()
        use_forward2_next = Signal()

        cache_out = CacheRamOut()

        plru_victim = PLRUOut()
        replace_way = Signal(WAY_BITS)

        # Wishbone read/write/cache write formatting signals
        bus_sel = Signal(8)

        # TLB signals
        tlb_tag_way = Signal(TLB_TAG_WAY_BITS)
        tlb_pte_way = Signal(TLB_PTE_WAY_BITS)
        tlb_valid_way = Signal(TLB_NUM_WAYS)
        tlb_req_index = Signal(TLB_SET_BITS)
        tlb_hit = Signal()
        tlb_hit_way = Signal(TLB_WAY_BITS)
        pte = Signal(TLB_PTE_BITS)
        ra = Signal(REAL_ADDR_BITS)
        valid_ra = Signal()
        perm_attr = PermAttr()
        rc_ok = Signal()
        perm_ok = Signal()
        access_ok = Signal()

        tlb_plru_victim = TLBPLRUOut()

        # we don't yet handle collisions between loadstore1 requests
        # and MMU requests
        comb += self.m_out.stall.eq(0)

        # Hold off the request in r0 when r1 has an uncompleted request
        comb += r0_stall.eq(r0_full & r1.full)
        comb += r0_valid.eq(r0_full & ~r1.full)
        comb += self.stall_out.eq(r0_stall)

        # Wire up wishbone request latch out of stage 1
        comb += self.wb_out.eq(r1.wb)

        # call sub-functions putting everything together,
        # using shared signals established above
        self.stage_0(m, r0, r1, r0_full)
        self.tlb_read(m, r0_stall, tlb_valid_way,
                      tlb_tag_way, tlb_pte_way, dtlb_valid_bits,
                      dtlb_tags, dtlb_ptes)
        self.tlb_search(m, tlb_req_index, r0, r0_valid,
                        tlb_valid_way, tlb_tag_way, tlb_hit_way,
                        tlb_pte_way, pte, tlb_hit, valid_ra, perm_attr, ra)
        self.tlb_update(m, r0_valid, r0, dtlb_valid_bits, tlb_req_index,
                        tlb_hit_way, tlb_hit, tlb_plru_victim, tlb_tag_way,
                        dtlb_tags, tlb_pte_way, dtlb_ptes)
        self.maybe_plrus(m, r1, plru_victim)
        self.maybe_tlb_plrus(m, r1, tlb_plru_victim)
        self.cache_tag_read(m, r0_stall, req_index, cache_tag_set, cache_tags)
        self.dcache_request(m, r0, ra, req_index, req_row, req_tag,
                            r0_valid, r1, cache_valid_bits, replace_way,
                            use_forward1_next, use_forward2_next,
                            req_hit_way, plru_victim, rc_ok, perm_attr,
                            valid_ra, perm_ok, access_ok, req_op, req_go,
                            tlb_pte_way,
                            tlb_hit, tlb_hit_way, tlb_valid_way,
                            cache_tag_set,
                            cancel_store, req_same_tag, r0_stall,
                            early_req_row)
        self.reservation_comb(m, cancel_store, set_rsrv, clear_rsrv,
                              r0_valid, r0, reservation)
        self.reservation_reg(m, r0_valid, access_ok, set_rsrv, clear_rsrv,
                             reservation, r0)
        self.writeback_control(m, r1, cache_out)
        self.rams(m, r1, early_req_row, cache_out, replace_way)
        self.dcache_fast_hit(m, req_op, r0_valid, r0, r1,
                             req_hit_way, req_index, access_ok,
                             tlb_hit, tlb_hit_way, tlb_req_index)
        self.dcache_slow(m, r1, use_forward1_next, use_forward2_next,
                         cache_valid_bits, r0, replace_way,
                         req_hit_way, req_same_tag,
                         r0_valid, req_op, cache_tags, req_go, ra)
        #self.dcache_log(m, r1, valid_ra, tlb_hit_way, self.stall_out, req_op)

        return m



# dcache_tb.vhdl
#
# entity dcache_tb is
# end dcache_tb;
#
# architecture behave of dcache_tb is
#     signal clk          : std_ulogic;
#     signal rst          : std_ulogic;
#
#     signal d_in         : Loadstore1ToDcacheType;
#     signal d_out        : DcacheToLoadstore1Type;
#
#     signal m_in         : MmuToDcacheType;
#     signal m_out        : DcacheToMmuType;
#
#     signal wb_bram_in   : wishbone_master_out;
#     signal wb_bram_out  : wishbone_slave_out;
#
#     constant clk_period : time := 10 ns;
# begin
#     dcache0: entity work.dcache
#         generic map(
#             LINE_SIZE => 64,
#             NUM_LINES => 4
#             )
#         port map(
#             clk => clk,
#             rst => rst,
#             d_in => d_in,
#             d_out => d_out,
#             m_in => m_in,
#             m_out => m_out,
#             wishbone_out => wb_bram_in,
#             wishbone_in => wb_bram_out
#             );
#
#     -- BRAM Memory slave
#     bram0: entity work.wishbone_bram_wrapper
#         generic map(
#             MEMORY_SIZE   => 1024,
#             RAM_INIT_FILE => "icache_test.bin"
#             )
#         port map(
#             clk => clk,
#             rst => rst,
#             wishbone_in => wb_bram_in,
#             wishbone_out => wb_bram_out
#             );
#
#     clk_process: process
#     begin
#         clk <= '0';
#         wait for clk_period/2;
#         clk <= '1';
#         wait for clk_period/2;
#     end process;
#
#     rst_process: process
#     begin
#         rst <= '1';
#         wait for 2*clk_period;
#         rst <= '0';
#         wait;
#     end process;
#
#     stim: process
#     begin
#         -- Clear stuff
#         d_in.valid <= '0';
#         d_in.load <= '0';
#         d_in.nc <= '0';
#         d_in.addr <= (others => '0');
#         d_in.data <= (others => '0');
#         m_in.valid <= '0';
#         m_in.addr <= (others => '0');
#         m_in.pte <= (others => '0');
#
#         wait for 4*clk_period;
#         wait until rising_edge(clk);
#
#         -- Cacheable read of address 4
#         d_in.load <= '1';
#         d_in.nc <= '0';
#         d_in.addr <= x"0000000000000004";
#         d_in.valid <= '1';
#         wait until rising_edge(clk);
#         d_in.valid <= '0';
#
#         wait until rising_edge(clk) and d_out.valid = '1';
#         assert d_out.data = x"0000000100000000"
#             report "data @" & to_hstring(d_in.addr) &
#                 "=" & to_hstring(d_out.data) &
#                 " expected 0000000100000000"
#             severity failure;
#         -- wait for clk_period;
#
#         -- Cacheable read of address 30
#         d_in.load <= '1';
#         d_in.nc <= '0';
#         d_in.addr <= x"0000000000000030";
#         d_in.valid <= '1';
#         wait until rising_edge(clk);
#         d_in.valid <= '0';
#
#         wait until rising_edge(clk) and d_out.valid = '1';
#         assert d_out.data = x"0000000D0000000C"
#             report "data @" & to_hstring(d_in.addr) &
#                 "=" & to_hstring(d_out.data) &
#                 " expected 0000000D0000000C"
#             severity failure;
#
#         -- Non-cacheable read of address 100
#         d_in.load <= '1';
#         d_in.nc <= '1';
#         d_in.addr <= x"0000000000000100";
#         d_in.valid <= '1';
#         wait until rising_edge(clk);
#         d_in.valid <= '0';
#         wait until rising_edge(clk) and d_out.valid = '1';
#         assert d_out.data = x"0000004100000040"
#             report "data @" & to_hstring(d_in.addr) &
#                 "=" & to_hstring(d_out.data) &
#                 " expected 0000004100000040"
#             severity failure;
#
#         wait until rising_edge(clk);
#         wait until rising_edge(clk);
#         wait until rising_edge(clk);
#         wait until rising_edge(clk);
#
#         std.env.finish;
#     end process;
# end;
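
# The test below is a Python translation of the dcache_tb stim process
# above: in an nmigen add_sync_process, each bare "yield" advances one
# clock, and "yield sig.eq(...)" drives an input.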
def dcache_sim(dut):
    # clear stuff
    yield dut.d_in.valid.eq(0)
    yield dut.d_in.load.eq(0)
    yield dut.d_in.nc.eq(0)
    yield dut.d_in.addr.eq(0)
    yield dut.d_in.data.eq(0)
    yield dut.m_in.valid.eq(0)
    yield dut.m_in.addr.eq(0)
    yield dut.m_in.pte.eq(0)
    # wait 4 * clk_period
    yield
    yield
    yield
    yield

    # Cacheable read of address 4
    yield dut.d_in.load.eq(1)
    yield dut.d_in.nc.eq(0)
    yield dut.d_in.addr.eq(0x0000000000000004)
    yield dut.d_in.valid.eq(1)
    yield
    yield dut.d_in.valid.eq(0)
    yield
    while not (yield dut.d_out.valid):
        yield
    data = yield dut.d_out.data
    addr = yield dut.d_in.addr
    assert data == 0x0000000100000000, \
        "data @%x=%x expected 0x0000000100000000" % (addr, data)

    # Cacheable read of address 30
    yield dut.d_in.load.eq(1)
    yield dut.d_in.nc.eq(0)
    yield dut.d_in.addr.eq(Const(0x0000000000000030, 64))
    yield dut.d_in.valid.eq(1)
    yield
    yield dut.d_in.valid.eq(0)
    yield
    while not (yield dut.d_out.valid):
        yield
    data = yield dut.d_out.data
    addr = yield dut.d_in.addr
    assert data == 0x0000000D0000000C, \
        "data @%x=%x expected 0x0000000D0000000C" % (addr, data)

    # Non-cacheable read of address 100
    yield dut.d_in.load.eq(1)
    yield dut.d_in.nc.eq(1)
    yield dut.d_in.addr.eq(Const(0x0000000000000100, 64))
    yield dut.d_in.valid.eq(1)
    yield
    yield dut.d_in.valid.eq(0)
    yield
    while not (yield dut.d_out.valid):
        yield
    data = yield dut.d_out.data
    addr = yield dut.d_in.addr
    assert data == 0x0000004100000040, \
        "data @%x=%x expected 0x0000004100000040" % (addr, data)

    yield
    yield
    yield
    yield


def test_dcache():
    dut = DCache()
    vl = rtlil.convert(dut, ports=[])
    with open("test_dcache.il", "w") as f:
        f.write(vl)

    m = Module()
    m.submodules.dcache = dut

    # nmigen Simulation
    sim = Simulator(m)
    sim.add_clock(1e-6)

    sim.add_sync_process(wrap(dcache_sim(dut)))
    with sim.write_vcd('test_dcache.vcd'):
        sim.run()


if __name__ == '__main__':
    test_dcache()