add random dcache mem test
[soc.git] / src / soc / experiment / dcache.py
1 """DCache
2
3 based on Anton Blanchard microwatt dcache.vhdl
4
5 """
6
7 from enum import Enum, unique
8
9 from nmigen import Module, Signal, Elaboratable, Cat, Repl, Array, Const
10 try:
11 from nmigen.hdl.ast import Display
12 except ImportError:
13 def Display(*args):
14 return []
15
16 from random import randint
17
18 from nmigen.cli import main
19 from nmutil.iocontrol import RecordObject
20 from nmutil.util import wrap
21 from nmigen.utils import log2_int
22 from soc.experiment.mem_types import (LoadStore1ToDCacheType,
23 DCacheToLoadStore1Type,
24 MMUToDCacheType,
25 DCacheToMMUType)
26
27 from soc.experiment.wb_types import (WB_ADDR_BITS, WB_DATA_BITS, WB_SEL_BITS,
28 WBAddrType, WBDataType, WBSelType,
29 WBMasterOut, WBSlaveOut,
30 WBMasterOutVector, WBSlaveOutVector,
31 WBIOMasterOut, WBIOSlaveOut)
32
33 from soc.experiment.cache_ram import CacheRam
34 from soc.experiment.plru import PLRU
35
36 # for test
37 from nmigen_soc.wishbone.sram import SRAM
38 from nmigen import Memory
39 from nmigen.cli import rtlil
40 if True: # use the nmigen pysim backend; set False to try cxxsim instead
41 from nmigen.back.pysim import Simulator, Delay, Settle
42 else:
43 from nmigen.sim.cxxsim import Simulator, Delay, Settle
44
45
46 # TODO: make these parameters of DCache at some point
47 LINE_SIZE = 64 # Line size in bytes
48 NUM_LINES = 16 # Number of lines in a set
49 NUM_WAYS = 4 # Number of ways
50 TLB_SET_SIZE = 64 # L1 DTLB entries per set
51 TLB_NUM_WAYS = 2 # L1 DTLB number of sets
52 TLB_LG_PGSZ = 12 # L1 DTLB log_2(page_size)
53 LOG_LENGTH = 0 # Non-zero to enable log data collection
54
55 # BRAM organisation: We never access more than
56 # -- WB_DATA_BITS at a time so to save
57 # -- resources we make the array only that wide, and
58 # -- use consecutive indices to make a cache "line"
59 # --
60 # -- ROW_SIZE is the width in bytes of the BRAM
61 # -- (based on WB, so 64-bits)
62 ROW_SIZE = WB_DATA_BITS // 8
63
64 # ROW_PER_LINE is the number of rows (wishbone
65 # transactions) in a line
66 ROW_PER_LINE = LINE_SIZE // ROW_SIZE
67
68 # BRAM_ROWS is the number of rows in BRAM needed
69 # to represent the full dcache
70 BRAM_ROWS = NUM_LINES * ROW_PER_LINE
71
72
73 # Bit fields counts in the address
74
75 # REAL_ADDR_BITS is the number of real address
76 # bits that we store
77 REAL_ADDR_BITS = 56
78
79 # ROW_BITS is the number of bits to select a row
80 ROW_BITS = log2_int(BRAM_ROWS)
81
82 # ROW_LINE_BITS is the number of bits to select
83 # a row within a line
84 ROW_LINE_BITS = log2_int(ROW_PER_LINE)
85
86 # LINE_OFF_BITS is the number of bits for
87 # the offset in a cache line
88 LINE_OFF_BITS = log2_int(LINE_SIZE)
89
90 # ROW_OFF_BITS is the number of bits for
91 # the offset in a row
92 ROW_OFF_BITS = log2_int(ROW_SIZE)
93
94 # INDEX_BITS is the number of bits to
95 # select a cache line
96 INDEX_BITS = log2_int(NUM_LINES)
97
98 # SET_SIZE_BITS is the log base 2 of the set size
99 SET_SIZE_BITS = LINE_OFF_BITS + INDEX_BITS
100
101 # TAG_BITS is the number of bits of
102 # the tag part of the address
103 TAG_BITS = REAL_ADDR_BITS - SET_SIZE_BITS
104
105 # TAG_WIDTH is the width in bits of each way of the tag RAM
106 TAG_WIDTH = TAG_BITS + 7 - ((TAG_BITS + 7) % 8)
107
108 # WAY_BITS is the number of bits to select a way
109 WAY_BITS = log2_int(NUM_WAYS)
110
111 # Example of layout for 32 lines of 64 bytes:
112 #
113 # .. tag |index| line |
114 # .. | row | |
115 # .. | |---| | ROW_LINE_BITS (3)
116 # .. | |--- - --| LINE_OFF_BITS (6)
117 # .. | |- --| ROW_OFF_BITS (3)
118 # .. |----- ---| | ROW_BITS (8)
119 # .. |-----| | INDEX_BITS (5)
120 # .. --------| | TAG_BITS (45)
121
122 TAG_RAM_WIDTH = TAG_WIDTH * NUM_WAYS
123
124 def CacheTagArray():
125 return Array(Signal(TAG_RAM_WIDTH, name="cachetag_%d" % x) \
126 for x in range(NUM_LINES))
127
128 def CacheValidBitsArray(): # one valid bit per way, per cache line
129 return Array(Signal(NUM_WAYS, name="cachevalid_%d" % x) \
130 for x in range(NUM_LINES))
131
132 def RowPerLineValidArray():
133 return Array(Signal(name="rows_valid%d" % x) \
134 for x in range(ROW_PER_LINE))
135
136 # L1 TLB
137 TLB_SET_BITS = log2_int(TLB_SET_SIZE)
138 TLB_WAY_BITS = log2_int(TLB_NUM_WAYS)
139 TLB_EA_TAG_BITS = 64 - (TLB_LG_PGSZ + TLB_SET_BITS)
140 TLB_TAG_WAY_BITS = TLB_NUM_WAYS * TLB_EA_TAG_BITS
141 TLB_PTE_BITS = 64
142 TLB_PTE_WAY_BITS = TLB_NUM_WAYS * TLB_PTE_BITS
143
144 assert (LINE_SIZE % ROW_SIZE) == 0, "LINE_SIZE not multiple of ROW_SIZE"
145 assert (LINE_SIZE & (LINE_SIZE - 1)) == 0, "LINE_SIZE not power of 2"
146 assert (NUM_LINES & (NUM_LINES - 1)) == 0, "NUM_LINES not power of 2"
147 assert (ROW_PER_LINE & (ROW_PER_LINE - 1)) == 0, "ROW_PER_LINE not power of 2"
148 assert ROW_BITS == (INDEX_BITS + ROW_LINE_BITS), "geometry bits don't add up"
149 assert (LINE_OFF_BITS == ROW_OFF_BITS + ROW_LINE_BITS), \
150 "geometry bits don't add up"
151 assert REAL_ADDR_BITS == (TAG_BITS + INDEX_BITS + LINE_OFF_BITS), \
152 "geometry bits don't add up"
153 assert REAL_ADDR_BITS == (TAG_BITS + ROW_BITS + ROW_OFF_BITS), \
154 "geometry bits don't add up"
155 assert 64 == WB_DATA_BITS, "Can't yet handle wb width that isn't 64-bits"
156 assert SET_SIZE_BITS <= TLB_LG_PGSZ, "Set indexed by virtual address"
157
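# With the default geometry above, the derived constants work out as follows
# (a hand-computed reference only, not used by the code):
#   ROW_SIZE      = 8 bytes   ROW_PER_LINE  = 8   BRAM_ROWS       = 128
#   ROW_BITS      = 7         ROW_LINE_BITS = 3   ROW_OFF_BITS    = 3
#   LINE_OFF_BITS = 6         INDEX_BITS    = 4   SET_SIZE_BITS   = 10
#   TAG_BITS      = 46        TAG_WIDTH     = 48  WAY_BITS        = 2
#   TLB_SET_BITS  = 6         TLB_WAY_BITS  = 1   TLB_EA_TAG_BITS = 46
# SET_SIZE_BITS (10) <= TLB_LG_PGSZ (12), so the set index lies entirely
# within the page offset and the cache can be indexed by the virtual address.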
158
159 def TLBValidBitsArray():
160 return Array(Signal(TLB_NUM_WAYS) for x in range(TLB_SET_SIZE))
161
162 def TLBTagEAArray():
163 return Array(Signal(TLB_EA_TAG_BITS) for x in range (TLB_NUM_WAYS))
164
165 def TLBTagsArray():
166 return Array(Signal(TLB_TAG_WAY_BITS) for x in range (TLB_SET_SIZE))
167
168 def TLBPtesArray():
169 return Array(Signal(TLB_PTE_WAY_BITS) for x in range(TLB_SET_SIZE))
170
171 def HitWaySet():
172 return Array(Signal(WAY_BITS, name="hitway_%d" % x) \
173 for x in range(TLB_NUM_WAYS))
174
175 # Cache RAM interface
176 def CacheRamOut():
177 return Array(Signal(WB_DATA_BITS, name="cache_out%d" % x) \
178 for x in range(NUM_WAYS))
179
180 # PLRU output interface
181 def PLRUOut():
182 return Array(Signal(WAY_BITS) for x in range(NUM_LINES))
183
184 # TLB PLRU output interface
185 def TLBPLRUOut():
186 return Array(Signal(TLB_WAY_BITS) for x in range(TLB_SET_SIZE))
187
188 # Helper functions to decode incoming requests
189 #
190 # Return the cache line index (tag index) for an address
191 def get_index(addr):
192 return addr[LINE_OFF_BITS:SET_SIZE_BITS]
193
194 # Return the cache row index (data memory) for an address
195 def get_row(addr):
196 return addr[ROW_OFF_BITS:SET_SIZE_BITS]
197
198 # Return the index of a row within a line
199 def get_row_of_line(row):
200 return row[:ROW_LINE_BITS]
201
202 # Returns whether this is the last row of a line
203 def is_last_row_addr(addr, last):
204 return addr[ROW_OFF_BITS:LINE_OFF_BITS] == last
205
206 # Returns whether this is the last row of a line
207 def is_last_row(row, last):
208 return get_row_of_line(row) == last
209
210 # Return the next row in the current cache line. We use a
211 # dedicated function in order to limit the size of the
212 # generated adder to be only the bits within a cache line
213 # (3 bits with default settings)
214 def next_row(row):
215 row_v = row[0:ROW_LINE_BITS] + 1
216 return Cat(row_v[:ROW_LINE_BITS], row[ROW_LINE_BITS:])
217
218 # Get the tag value from the address
219 def get_tag(addr):
220 return addr[SET_SIZE_BITS:REAL_ADDR_BITS]
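# Worked example (sketch, default geometry) for the byte address 0x530:
#   get_index(0x530)      == 0x4   (addr bits [6:10])
#   get_row(0x530)        == 0x26  (addr bits [3:10])
#   get_row_of_line(0x26) == 0x6
#   get_tag(0x530)        == 0x1   (addr bits [10:56])
# and next_row(0x27) == 0x20: the row-within-line wraps from 7 back to 0
# while the upper (line index) bits are deliberately left unchanged.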
221
222 # Read a tag from a tag memory row
223 def read_tag(way, tagset):
224 return tagset.word_select(way, TAG_WIDTH)[:TAG_BITS]
225
226 # Read a TLB tag from a TLB tag memory row
227 def read_tlb_tag(way, tags):
228 return tags.word_select(way, TLB_EA_TAG_BITS)
229
230 # Write a TLB tag to a TLB tag memory row
231 def write_tlb_tag(way, tags, tag):
232 return read_tlb_tag(way, tags).eq(tag)
233
234 # Read a PTE from a TLB PTE memory row
235 def read_tlb_pte(way, ptes):
236 return ptes.word_select(way, TLB_PTE_BITS)
237
238 def write_tlb_pte(way, ptes, newpte):
239 return read_tlb_pte(way, ptes).eq(newpte)
240
241
242 # Record for storing permission, attribute, etc. bits from a PTE
243 class PermAttr(RecordObject):
244 def __init__(self, name=None):
245 super().__init__(name=name)
246 self.reference = Signal()
247 self.changed = Signal()
248 self.nocache = Signal()
249 self.priv = Signal()
250 self.rd_perm = Signal()
251 self.wr_perm = Signal()
252
253
254 def extract_perm_attr(pte):
255 pa = PermAttr()
256 pa.reference = pte[8]
257 pa.changed = pte[7]
258 pa.nocache = pte[5]
259 pa.priv = pte[3]
260 pa.rd_perm = pte[2]
261 pa.wr_perm = pte[1]
262 return pa
263
264
265 # Type of operation on a "valid" input
266 @unique
267 class Op(Enum):
268 OP_NONE = 0
269 OP_BAD = 1 # NC cache hit, TLB miss, prot/RC failure
270 OP_STCX_FAIL = 2 # conditional store w/o reservation
271 OP_LOAD_HIT = 3 # Cache hit on load
272 OP_LOAD_MISS = 4 # Load missing cache
273 OP_LOAD_NC = 5 # Non-cachable load
274 OP_STORE_HIT = 6 # Store hitting cache
275 OP_STORE_MISS = 7 # Store missing cache
276
277
278 # Cache state machine
279 @unique
280 class State(Enum):
281 IDLE = 0 # Normal load hit processing
282 RELOAD_WAIT_ACK = 1 # Cache reload wait ack
283 STORE_WAIT_ACK = 2 # Store wait ack
284 NC_LOAD_WAIT_ACK = 3 # Non-cachable load wait ack
285
286
287 # Dcache operations:
288 #
289 # In order to make timing, we use the BRAMs with
290 # an output buffer, which means that the BRAM
291 # output is delayed by an extra cycle.
292 #
293 # Thus, the dcache has a 2-stage internal pipeline
294 # for cache hits with no stalls.
295 #
296 # All other operations are handled via stalling
297 # in the first stage.
298 #
299 # The second stage can thus complete a hit at the same
300 # time as the first stage emits a stall for a complex op.
301 #
302 # Stage 0 register, basically contains just the latched request
303
304 class RegStage0(RecordObject):
305 def __init__(self, name=None):
306 super().__init__(name=name)
307 self.req = LoadStore1ToDCacheType(name="lsmem")
308 self.tlbie = Signal()
309 self.doall = Signal()
310 self.tlbld = Signal()
311 self.mmu_req = Signal() # indicates source of request
312
313
314 class MemAccessRequest(RecordObject):
315 def __init__(self, name=None):
316 super().__init__(name=name)
317 self.op = Signal(Op)
318 self.valid = Signal()
319 self.dcbz = Signal()
320 self.real_addr = Signal(REAL_ADDR_BITS)
321 self.data = Signal(64)
322 self.byte_sel = Signal(8)
323 self.hit_way = Signal(WAY_BITS)
324 self.same_tag = Signal()
325 self.mmu_req = Signal()
326
327
328 # First stage register, contains state for stage 1 of load hits
329 # and for the state machine used by all other operations
330 class RegStage1(RecordObject):
331 def __init__(self, name=None):
332 super().__init__(name=name)
333 # Info about the request
334 self.full = Signal() # have uncompleted request
335 self.mmu_req = Signal() # request is from MMU
336 self.req = MemAccessRequest(name="reqmem")
337
338 # Cache hit state
339 self.hit_way = Signal(WAY_BITS)
340 self.hit_load_valid = Signal()
341 self.hit_index = Signal(INDEX_BITS)
342 self.cache_hit = Signal()
343
344 # TLB hit state
345 self.tlb_hit = Signal()
346 self.tlb_hit_way = Signal(TLB_NUM_WAYS)
347 self.tlb_hit_index = Signal(TLB_WAY_BITS)
348
349 # 2-stage data buffer for data forwarded from writes to reads
350 self.forward_data1 = Signal(64)
351 self.forward_data2 = Signal(64)
352 self.forward_sel1 = Signal(8)
353 self.forward_valid1 = Signal()
354 self.forward_way1 = Signal(WAY_BITS)
355 self.forward_row1 = Signal(ROW_BITS)
356 self.use_forward1 = Signal()
357 self.forward_sel = Signal(8)
358
359 # Cache miss state (reload state machine)
360 self.state = Signal(State)
361 self.dcbz = Signal()
362 self.write_bram = Signal()
363 self.write_tag = Signal()
364 self.slow_valid = Signal()
365 self.wb = WBMasterOut()
366 self.reload_tag = Signal(TAG_BITS)
367 self.store_way = Signal(WAY_BITS)
368 self.store_row = Signal(ROW_BITS)
369 self.store_index = Signal(INDEX_BITS)
370 self.end_row_ix = Signal(ROW_LINE_BITS)
371 self.rows_valid = RowPerLineValidArray()
372 self.acks_pending = Signal(3)
373 self.inc_acks = Signal()
374 self.dec_acks = Signal()
375
376 # Signals to complete (possibly with error)
377 self.ls_valid = Signal()
378 self.ls_error = Signal()
379 self.mmu_done = Signal()
380 self.mmu_error = Signal()
381 self.cache_paradox = Signal()
382
383 # Signal to complete a failed stcx.
384 self.stcx_fail = Signal()
385
386
387 # Reservation information
388 class Reservation(RecordObject):
389 def __init__(self):
390 super().__init__()
391 self.valid = Signal()
392 self.addr = Signal(64-LINE_OFF_BITS)
393
394
395 class DTLBUpdate(Elaboratable):
396 def __init__(self):
397 self.tlbie = Signal()
398 self.tlbwe = Signal()
399 self.doall = Signal()
400 self.updated = Signal()
401 self.v_updated = Signal()
402 self.tlb_hit = Signal()
403 self.tlb_req_index = Signal(TLB_SET_BITS)
404
405 self.tlb_hit_way = Signal(TLB_WAY_BITS)
406 self.tlb_tag_way = Signal(TLB_TAG_WAY_BITS)
407 self.tlb_pte_way = Signal(TLB_PTE_WAY_BITS)
408 self.repl_way = Signal(TLB_WAY_BITS)
409 self.eatag = Signal(TLB_EA_TAG_BITS)
410 self.pte_data = Signal(TLB_PTE_BITS)
411
412 self.dv = Signal(TLB_PTE_WAY_BITS)
413
414 self.tb_out = Signal(TLB_TAG_WAY_BITS)
415 self.pb_out = Signal(TLB_NUM_WAYS)
416 self.db_out = Signal(TLB_PTE_WAY_BITS)
417
418 def elaborate(self, platform):
419 m = Module()
420 comb = m.d.comb
421 sync = m.d.sync
422
423 tagset = Signal(TLB_TAG_WAY_BITS)
424 pteset = Signal(TLB_PTE_WAY_BITS)
425
426 tb_out, pb_out, db_out = self.tb_out, self.pb_out, self.db_out
427
428 with m.If(self.tlbie & self.doall):
429 pass # clear all back in parent
430 with m.Elif(self.tlbie):
431 with m.If(self.tlb_hit):
432 comb += db_out.eq(self.dv)
433 comb += db_out.bit_select(self.tlb_hit_way, 1).eq(1)
434 comb += self.v_updated.eq(1)
435
436 with m.Elif(self.tlbwe):
437
438 comb += tagset.eq(self.tlb_tag_way)
439 comb += write_tlb_tag(self.repl_way, tagset, self.eatag)
440 comb += tb_out.eq(tagset)
441
442 comb += pteset.eq(self.tlb_pte_way)
443 comb += write_tlb_pte(self.repl_way, pteset, self.pte_data)
444 comb += pb_out.eq(pteset)
445
446 comb += db_out.bit_select(self.repl_way, 1).eq(1)
447
448 comb += self.updated.eq(1)
449 comb += self.v_updated.eq(1)
450
451 return m
452
453 def dcache_request(self, m, r0, ra, req_index, req_row, req_tag,
454 r0_valid, r1, cache_valid_bits, replace_way,
455 use_forward1_next, use_forward2_next,
456 req_hit_way, plru_victim, rc_ok, perm_attr,
457 valid_ra, perm_ok, access_ok, req_op, req_go,
458 tlb_pte_way,
459 tlb_hit, tlb_hit_way, tlb_valid_way, cache_tag_set,
460 cancel_store, req_same_tag, r0_stall, early_req_row):
461 """Cache request parsing and hit detection
462 """
463
464 class DCachePendingHit(Elaboratable):
465
466 def __init__(self, tlb_pte_way, tlb_valid_way, tlb_hit_way,
467 cache_valid_idx, cache_tag_set,
468 req_addr,
469 hit_set):
470
471 self.go = Signal()
472 self.virt_mode = Signal()
473 self.is_hit = Signal()
474 self.tlb_hit = Signal()
475 self.hit_way = Signal(WAY_BITS)
476 self.rel_match = Signal()
477 self.req_index = Signal(INDEX_BITS)
478 self.reload_tag = Signal(TAG_BITS)
479
480 self.tlb_hit_way = tlb_hit_way
481 self.tlb_pte_way = tlb_pte_way
482 self.tlb_valid_way = tlb_valid_way
483 self.cache_valid_idx = cache_valid_idx
484 self.cache_tag_set = cache_tag_set
485 self.req_addr = req_addr
486 self.hit_set = hit_set
487
488 def elaborate(self, platform):
489 m = Module()
490 comb = m.d.comb
491 sync = m.d.sync
492
493 go = self.go
494 virt_mode = self.virt_mode
495 is_hit = self.is_hit
496 tlb_pte_way = self.tlb_pte_way
497 tlb_valid_way = self.tlb_valid_way
498 cache_valid_idx = self.cache_valid_idx
499 cache_tag_set = self.cache_tag_set
500 req_addr = self.req_addr
501 tlb_hit_way = self.tlb_hit_way
502 tlb_hit = self.tlb_hit
503 hit_set = self.hit_set
504 hit_way = self.hit_way
505 rel_match = self.rel_match
506 req_index = self.req_index
507 reload_tag = self.reload_tag
508
509 rel_matches = Array(Signal(name="rel_matches_%d" % i) \
510 for i in range(TLB_NUM_WAYS))
511 hit_way_set = HitWaySet()
512
513 # Test if pending request is a hit on any way
514 # In order to make timing in virtual mode,
515 # when we are using the TLB, we compare each
516 # way with each of the real addresses from each way of
517 # the TLB, and then decide later which match to use.
518
519 with m.If(virt_mode):
520 for j in range(TLB_NUM_WAYS):
521 s_tag = Signal(TAG_BITS, name="s_tag%d" % j)
522 s_hit = Signal()
523 s_pte = Signal(TLB_PTE_BITS)
524 s_ra = Signal(REAL_ADDR_BITS)
525 comb += s_pte.eq(read_tlb_pte(j, tlb_pte_way))
526 comb += s_ra.eq(Cat(req_addr[0:TLB_LG_PGSZ],
527 s_pte[TLB_LG_PGSZ:REAL_ADDR_BITS]))
528 comb += s_tag.eq(get_tag(s_ra))
529
530 for i in range(NUM_WAYS):
531 is_tag_hit = Signal(name="is_tag_hit_%d_%d" % (j, i))
532 comb += is_tag_hit.eq(go & cache_valid_idx[i] &
533 (read_tag(i, cache_tag_set) == s_tag)
534 & tlb_valid_way[j])
535 with m.If(is_tag_hit):
536 comb += hit_way_set[j].eq(i)
537 comb += s_hit.eq(1)
538 comb += hit_set[j].eq(s_hit)
539 with m.If(s_tag == reload_tag):
540 comb += rel_matches[j].eq(1)
541 with m.If(tlb_hit):
542 comb += is_hit.eq(hit_set[tlb_hit_way])
543 comb += hit_way.eq(hit_way_set[tlb_hit_way])
544 comb += rel_match.eq(rel_matches[tlb_hit_way])
545 with m.Else():
546 s_tag = Signal(TAG_BITS)
547 comb += s_tag.eq(get_tag(req_addr))
548 for i in range(NUM_WAYS):
549 is_tag_hit = Signal(name="is_tag_hit_%d" % i)
550 comb += is_tag_hit.eq(go & cache_valid_idx[i] &
551 (read_tag(i, cache_tag_set) == s_tag))
552 with m.If(is_tag_hit):
553 comb += hit_way.eq(i)
554 comb += is_hit.eq(1)
555 with m.If(s_tag == reload_tag):
556 comb += rel_match.eq(1)
557
558 return m
559
560
561 class DCache(Elaboratable):
562 """Set associative dcache write-through
563 TODO (in no specific order):
564 * See list in icache.vhdl
565 * Complete load misses on the cycle when WB data comes instead of
566 at the end of line (this requires dealing with requests coming in
567 while not idle...)
568 """
569 def __init__(self):
570 self.d_in = LoadStore1ToDCacheType("d_in")
571 self.d_out = DCacheToLoadStore1Type("d_out")
572
573 self.m_in = MMUToDCacheType("m_in")
574 self.m_out = DCacheToMMUType("m_out")
575
576 self.stall_out = Signal()
577
578 self.wb_out = WBMasterOut()
579 self.wb_in = WBSlaveOut()
580
581 self.log_out = Signal(20)
582
583 def stage_0(self, m, r0, r1, r0_full):
584 """Latch the request in r0.req as long as we're not stalling
585 """
586 comb = m.d.comb
587 sync = m.d.sync
588 d_in, d_out, m_in = self.d_in, self.d_out, self.m_in
589
590 r = RegStage0("stage0")
591
592 # TODO, this goes in unit tests and formal proofs
593 with m.If(~(d_in.valid & m_in.valid)):
594 #sync += Display("request collision loadstore vs MMU")
595 pass
596
597 with m.If(m_in.valid):
598 sync += r.req.valid.eq(1)
599 sync += r.req.load.eq(~(m_in.tlbie | m_in.tlbld))
600 sync += r.req.dcbz.eq(0)
601 sync += r.req.nc.eq(0)
602 sync += r.req.reserve.eq(0)
603 sync += r.req.virt_mode.eq(1)
604 sync += r.req.priv_mode.eq(1)
605 sync += r.req.addr.eq(m_in.addr)
606 sync += r.req.data.eq(m_in.pte)
607 sync += r.req.byte_sel.eq(~0) # Const -1 sets all to 0b111....
608 sync += r.tlbie.eq(m_in.tlbie)
609 sync += r.doall.eq(m_in.doall)
610 sync += r.tlbld.eq(m_in.tlbld)
611 sync += r.mmu_req.eq(1)
612 with m.Else():
613 sync += r.req.eq(d_in)
614 sync += r.tlbie.eq(0)
615 sync += r.doall.eq(0)
616 sync += r.tlbld.eq(0)
617 sync += r.mmu_req.eq(0)
618 with m.If(~(r1.full & r0_full)):
619 sync += r0.eq(r)
620 sync += r0_full.eq(r.req.valid)
621
622 def tlb_read(self, m, r0_stall, tlb_valid_way,
623 tlb_tag_way, tlb_pte_way, dtlb_valid_bits,
624 dtlb_tags, dtlb_ptes):
625 """TLB
626 Operates in the second cycle on the request latched in r0.req.
627 TLB updates write the entry at the end of the second cycle.
628 """
629 comb = m.d.comb
630 sync = m.d.sync
631 m_in, d_in = self.m_in, self.d_in
632
633 index = Signal(TLB_SET_BITS)
634 addrbits = Signal(TLB_SET_BITS)
635
636 amin = TLB_LG_PGSZ
637 amax = TLB_LG_PGSZ + TLB_SET_BITS
638
639 with m.If(m_in.valid):
640 comb += addrbits.eq(m_in.addr[amin : amax])
641 with m.Else():
642 comb += addrbits.eq(d_in.addr[amin : amax])
643 comb += index.eq(addrbits)
644
645 # If we have any op and the previous op isn't finished,
646 # then keep the same output for next cycle.
647 with m.If(~r0_stall):
648 sync += tlb_valid_way.eq(dtlb_valid_bits[index])
649 sync += tlb_tag_way.eq(dtlb_tags[index])
650 sync += tlb_pte_way.eq(dtlb_ptes[index])
651
652 def maybe_tlb_plrus(self, m, r1, tlb_plru_victim):
653 """Generate TLB PLRUs
654 """
655 comb = m.d.comb
656 sync = m.d.sync
657
658 if TLB_NUM_WAYS == 0:
659 return
660 for i in range(TLB_SET_SIZE):
661 # TLB PLRU interface
662 tlb_plru = PLRU(TLB_WAY_BITS)
663 setattr(m.submodules, "maybe_plru_%d" % i, tlb_plru)
664 tlb_plru_acc_en = Signal()
665
666 comb += tlb_plru_acc_en.eq(r1.tlb_hit & (r1.tlb_hit_index == i))
667 comb += tlb_plru.acc_en.eq(tlb_plru_acc_en)
668 comb += tlb_plru.acc.eq(r1.tlb_hit_way)
669 comb += tlb_plru_victim[i].eq(tlb_plru.lru_o)
670
671 def tlb_search(self, m, tlb_req_index, r0, r0_valid,
672 tlb_valid_way, tlb_tag_way, tlb_hit_way,
673 tlb_pte_way, pte, tlb_hit, valid_ra, perm_attr, ra):
674
675 comb = m.d.comb
676 sync = m.d.sync
677
678 hitway = Signal(TLB_WAY_BITS)
679 hit = Signal()
680 eatag = Signal(TLB_EA_TAG_BITS)
681
682 TLB_LG_END = TLB_LG_PGSZ + TLB_SET_BITS
683 comb += tlb_req_index.eq(r0.req.addr[TLB_LG_PGSZ : TLB_LG_END])
684 comb += eatag.eq(r0.req.addr[TLB_LG_END : 64 ])
685
686 for i in range(TLB_NUM_WAYS):
687 is_tag_hit = Signal()
688 comb += is_tag_hit.eq(tlb_valid_way[i]
689 & (read_tlb_tag(i, tlb_tag_way) == eatag))
690 with m.If(is_tag_hit):
691 comb += hitway.eq(i)
692 comb += hit.eq(1)
693
694 comb += tlb_hit.eq(hit & r0_valid)
695 comb += tlb_hit_way.eq(hitway)
696
697 with m.If(tlb_hit):
698 comb += pte.eq(read_tlb_pte(hitway, tlb_pte_way))
699 with m.Else():
700 comb += pte.eq(0)
701 comb += valid_ra.eq(tlb_hit | ~r0.req.virt_mode)
702 with m.If(r0.req.virt_mode):
703 comb += ra.eq(Cat(Const(0, ROW_OFF_BITS),
704 r0.req.addr[ROW_OFF_BITS:TLB_LG_PGSZ],
705 pte[TLB_LG_PGSZ:REAL_ADDR_BITS]))
706 comb += perm_attr.eq(extract_perm_attr(pte))
707 with m.Else():
708 comb += ra.eq(Cat(Const(0, ROW_OFF_BITS),
709 r0.req.addr[ROW_OFF_BITS:REAL_ADDR_BITS]))
710
711 comb += perm_attr.reference.eq(1)
712 comb += perm_attr.changed.eq(1)
713 comb += perm_attr.nocache.eq(0)
714 comb += perm_attr.priv.eq(1)
715 comb += perm_attr.rd_perm.eq(1)
716 comb += perm_attr.wr_perm.eq(1)
717
718 def tlb_update(self, m, r0_valid, r0, dtlb_valid_bits, tlb_req_index,
719 tlb_hit_way, tlb_hit, tlb_plru_victim, tlb_tag_way,
720 dtlb_tags, tlb_pte_way, dtlb_ptes):
721
722 comb = m.d.comb
723 sync = m.d.sync
724
725 tlbie = Signal()
726 tlbwe = Signal()
727
728 comb += tlbie.eq(r0_valid & r0.tlbie)
729 comb += tlbwe.eq(r0_valid & r0.tlbld)
730
731 m.submodules.tlb_update = d = DTLBUpdate()
732 with m.If(tlbie & r0.doall):
733 # clear all valid bits at once
734 for i in range(TLB_SET_SIZE):
735 sync += dtlb_valid_bits[i].eq(0)
736 with m.If(d.updated):
737 sync += dtlb_tags[tlb_req_index].eq(d.tb_out)
738 sync += dtlb_ptes[tlb_req_index].eq(d.pb_out)
739 with m.If(d.v_updated):
740 sync += dtlb_valid_bits[tlb_req_index].eq(d.db_out)
741
742 comb += d.dv.eq(dtlb_valid_bits[tlb_req_index])
743
744 comb += d.tlbie.eq(tlbie)
745 comb += d.tlbwe.eq(tlbwe)
746 comb += d.doall.eq(r0.doall)
747 comb += d.tlb_hit.eq(tlb_hit)
748 comb += d.tlb_hit_way.eq(tlb_hit_way)
749 comb += d.tlb_tag_way.eq(tlb_tag_way)
750 comb += d.tlb_pte_way.eq(tlb_pte_way)
751 comb += d.tlb_req_index.eq(tlb_req_index)
752
753 with m.If(tlb_hit):
754 comb += d.repl_way.eq(tlb_hit_way)
755 with m.Else():
756 comb += d.repl_way.eq(tlb_plru_victim[tlb_req_index])
757 comb += d.eatag.eq(r0.req.addr[TLB_LG_PGSZ + TLB_SET_BITS:64])
758 comb += d.pte_data.eq(r0.req.data)
759
760 def maybe_plrus(self, m, r1, plru_victim):
761 """Generate PLRUs
762 """
763 comb = m.d.comb
764 sync = m.d.sync
765
766 if NUM_WAYS == 0:
767 return
768
769 for i in range(NUM_LINES):
770 # PLRU interface
771 plru = PLRU(WAY_BITS)
772 setattr(m.submodules, "plru%d" % i, plru)
773 plru_acc_en = Signal()
774
775 comb += plru_acc_en.eq(r1.cache_hit & (r1.hit_index == i))
776 comb += plru.acc_en.eq(plru_acc_en)
777 comb += plru.acc.eq(r1.hit_way)
778 comb += plru_victim[i].eq(plru.lru_o)
779
780 def cache_tag_read(self, m, r0_stall, req_index, cache_tag_set, cache_tags):
781 """Cache tag RAM read port
782 """
783 comb = m.d.comb
784 sync = m.d.sync
785 m_in, d_in = self.m_in, self.d_in
786
787 index = Signal(INDEX_BITS)
788
789 with m.If(r0_stall):
790 comb += index.eq(req_index)
791 with m.Elif(m_in.valid):
792 comb += index.eq(get_index(m_in.addr))
793 with m.Else():
794 comb += index.eq(get_index(d_in.addr))
795 sync += cache_tag_set.eq(cache_tags[index])
796
797 def dcache_request(self, m, r0, ra, req_index, req_row, req_tag,
798 r0_valid, r1, cache_valid_bits, replace_way,
799 use_forward1_next, use_forward2_next,
800 req_hit_way, plru_victim, rc_ok, perm_attr,
801 valid_ra, perm_ok, access_ok, req_op, req_go,
802 tlb_pte_way,
803 tlb_hit, tlb_hit_way, tlb_valid_way, cache_tag_set,
804 cancel_store, req_same_tag, r0_stall, early_req_row):
805 """Cache request parsing and hit detection
806 """
807
808 comb = m.d.comb
809 sync = m.d.sync
810 m_in, d_in = self.m_in, self.d_in
811
812 is_hit = Signal()
813 hit_way = Signal(WAY_BITS)
814 op = Signal(Op)
815 opsel = Signal(3)
816 go = Signal()
817 nc = Signal()
818 hit_set = Array(Signal(name="hit_set_%d" % i) \
819 for i in range(TLB_NUM_WAYS))
820 cache_valid_idx = Signal(NUM_WAYS)
821
822 # Extract line, row and tag from request
823 comb += req_index.eq(get_index(r0.req.addr))
824 comb += req_row.eq(get_row(r0.req.addr))
825 comb += req_tag.eq(get_tag(ra))
826
827 comb += go.eq(r0_valid & ~(r0.tlbie | r0.tlbld) & ~r1.ls_error)
828 comb += cache_valid_idx.eq(cache_valid_bits[req_index])
829
830 m.submodules.dcache_pend = dc = DCachePendingHit(tlb_pte_way,
831 tlb_valid_way, tlb_hit_way,
832 cache_valid_idx, cache_tag_set,
833 r0.req.addr,
834 hit_set)
835
836 comb += dc.tlb_hit.eq(tlb_hit)
837 comb += dc.reload_tag.eq(r1.reload_tag)
838 comb += dc.virt_mode.eq(r0.req.virt_mode)
839 comb += dc.go.eq(go)
840 comb += dc.req_index.eq(req_index)
841 comb += is_hit.eq(dc.is_hit)
842 comb += hit_way.eq(dc.hit_way)
843 comb += req_same_tag.eq(dc.rel_match)
844
845 # See if the request matches the line currently being reloaded
846 with m.If((r1.state == State.RELOAD_WAIT_ACK) &
847 (req_index == r1.store_index) & req_same_tag):
848 # For a store, consider this a hit even if the row isn't
849 # valid since it will be by the time we perform the store.
850 # For a load, check the appropriate row valid bit.
851 valid = r1.rows_valid[req_row % ROW_PER_LINE]
852 comb += is_hit.eq(~r0.req.load | valid)
853 comb += hit_way.eq(replace_way)
854
855 # Whether to use forwarded data for a load or not
856 with m.If((get_row(r1.req.real_addr) == req_row) &
857 (r1.req.hit_way == hit_way)):
858 # Only need to consider r1.write_bram here, since if we
859 # are writing refill data here, then we don't have a
860 # cache hit this cycle on the line being refilled.
861 # (There is the possibility that the load following the
862 # load miss that started the refill could be to the old
863 # contents of the victim line, since it is a couple of
864 # cycles after the refill starts before we see the updated
865 # cache tag. In that case we don't use the bypass.)
866 comb += use_forward1_next.eq(r1.write_bram)
867 with m.If((r1.forward_row1 == req_row) & (r1.forward_way1 == hit_way)):
868 comb += use_forward2_next.eq(r1.forward_valid1)
869
870 # The way that matched on a hit
871 comb += req_hit_way.eq(hit_way)
872
873 # The way to replace on a miss
874 with m.If(r1.write_tag):
875 comb += replace_way.eq(plru_victim[r1.store_index])
876 with m.Else():
877 comb += replace_way.eq(r1.store_way)
878
879 # work out whether we have permission for this access
880 # NB we don't yet implement AMR, thus no KUAP
881 comb += rc_ok.eq(perm_attr.reference
882 & (r0.req.load | perm_attr.changed)
883 )
884 comb += perm_ok.eq((r0.req.priv_mode | ~perm_attr.priv) &
885 (perm_attr.wr_perm |
886 (r0.req.load & perm_attr.rd_perm)))
887 comb += access_ok.eq(valid_ra & perm_ok & rc_ok)
888 # Combine the request and cache hit status to decide what
889 # operation needs to be done
890 comb += nc.eq(r0.req.nc | perm_attr.nocache)
891 comb += op.eq(Op.OP_NONE)
892 with m.If(go):
893 with m.If(~access_ok):
894 comb += op.eq(Op.OP_BAD)
895 with m.Elif(cancel_store):
896 comb += op.eq(Op.OP_STCX_FAIL)
897 with m.Else():
898 comb += opsel.eq(Cat(is_hit, nc, r0.req.load))
899 with m.Switch(opsel):
900 with m.Case(0b101):
901 comb += op.eq(Op.OP_LOAD_HIT)
902 with m.Case(0b100):
903 comb += op.eq(Op.OP_LOAD_MISS)
904 with m.Case(0b110):
905 comb += op.eq(Op.OP_LOAD_NC)
906 with m.Case(0b001):
907 comb += op.eq(Op.OP_STORE_HIT)
908 with m.Case(0b000):
909 comb += op.eq(Op.OP_STORE_MISS)
910 with m.Case(0b010):
911 comb += op.eq(Op.OP_STORE_MISS)
912 with m.Case(0b011):
913 comb += op.eq(Op.OP_BAD)
914 with m.Case(0b111):
915 comb += op.eq(Op.OP_BAD)
916 with m.Default():
917 comb += op.eq(Op.OP_NONE)
918 comb += req_op.eq(op)
919 comb += req_go.eq(go)
920
921 # Version of the row number that is valid one cycle earlier
922 # in the cases where we need to read the cache data BRAM.
923 # If we're stalling then we need to keep reading the last
924 # row requested.
925 with m.If(~r0_stall):
926 with m.If(m_in.valid):
927 comb += early_req_row.eq(get_row(m_in.addr))
928 with m.Else():
929 comb += early_req_row.eq(get_row(d_in.addr))
930 with m.Else():
931 comb += early_req_row.eq(req_row)
932
933 def reservation_comb(self, m, cancel_store, set_rsrv, clear_rsrv,
934 r0_valid, r0, reservation):
935 """Handle load-with-reservation and store-conditional instructions
936 """
937 comb = m.d.comb
938 sync = m.d.sync
939
940 with m.If(r0_valid & r0.req.reserve):
941
942 # XXX generate alignment interrupt if address
943 # is not aligned XXX or if r0.req.nc = '1'
944 with m.If(r0.req.load):
945 comb += set_rsrv.eq(1) # load with reservation
946 with m.Else():
947 comb += clear_rsrv.eq(1) # store conditional
948 with m.If(~reservation.valid | (r0.req.addr[LINE_OFF_BITS:64] != reservation.addr)):
949 comb += cancel_store.eq(1)
950
951 def reservation_reg(self, m, r0_valid, access_ok, set_rsrv, clear_rsrv,
952 reservation, r0):
953
954 comb = m.d.comb
955 sync = m.d.sync
956
957 with m.If(r0_valid & access_ok):
958 with m.If(clear_rsrv):
959 sync += reservation.valid.eq(0)
960 with m.Elif(set_rsrv):
961 sync += reservation.valid.eq(1)
962 sync += reservation.addr.eq(r0.req.addr[LINE_OFF_BITS:64])
963
964 def writeback_control(self, m, r1, cache_out):
965 """Return data for loads & completion control logic
966 """
967 comb = m.d.comb
968 sync = m.d.sync
969 d_out, m_out = self.d_out, self.m_out
970
971 data_out = Signal(64)
972 data_fwd = Signal(64)
973
974 # Use the bypass if are reading the row that was
975 # written 1 or 2 cycles ago, including for the
976 # slow_valid = 1 case (i.e. completing a load
977 # miss or a non-cacheable load).
978 with m.If(r1.use_forward1):
979 comb += data_fwd.eq(r1.forward_data1)
980 with m.Else():
981 comb += data_fwd.eq(r1.forward_data2)
982
983 comb += data_out.eq(cache_out[r1.hit_way])
984
985 for i in range(8):
986 with m.If(r1.forward_sel[i]):
987 dsel = data_fwd.word_select(i, 8)
988 comb += data_out.word_select(i, 8).eq(dsel)
989
990 comb += d_out.valid.eq(r1.ls_valid)
991 comb += d_out.data.eq(data_out)
992 comb += d_out.store_done.eq(~r1.stcx_fail)
993 comb += d_out.error.eq(r1.ls_error)
994 comb += d_out.cache_paradox.eq(r1.cache_paradox)
995
996 # Outputs to MMU
997 comb += m_out.done.eq(r1.mmu_done)
998 comb += m_out.err.eq(r1.mmu_error)
999 comb += m_out.data.eq(data_out)
1000
1001 # We have a valid load or store hit or we just completed
1002 # a slow op such as a load miss, a NC load or a store
1003 #
1004 # Note: the load hit is delayed by one cycle. However it
1005 # can still not collide with r.slow_valid (well unless I
1006 # miscalculated) because slow_valid can only be set on a
1007 # subsequent request and not on its first cycle (the state
1008 # machine must have advanced), which makes slow_valid
1009 # at least 2 cycles from the previous hit_load_valid.
1010
1011 # Sanity: Only one of these must be set in any given cycle
1012
1013 if False: # TODO: need Display to get this to work
1014 assert (r1.slow_valid & r1.stcx_fail) != 1, \
1015 "unexpected slow_valid collision with stcx_fail"
1016
1017 assert ((r1.slow_valid | r1.stcx_fail) | r1.hit_load_valid) != 1, \
1018 "unexpected hit_load_delayed collision with slow_valid"
1019
1020 with m.If(~r1.mmu_req):
1021 # Request came from loadstore1...
1022 # Load hit case is the standard path
1023 with m.If(r1.hit_load_valid):
1024 sync += Display("completing load hit data=%x", data_out)
1025
1026 # error cases complete without stalling
1027 with m.If(r1.ls_error):
1028 sync += Display("completing ld/st with error")
1029
1030 # Slow ops (load miss, NC, stores)
1031 with m.If(r1.slow_valid):
1032 sync += Display("completing store or load miss data=%x",
1033 data_out)
1034
1035 with m.Else():
1036 # Request came from MMU
1037 with m.If(r1.hit_load_valid):
1038 sync += Display("completing load hit to MMU, data=%x",
1039 m_out.data)
1040 # error cases complete without stalling
1041 with m.If(r1.mmu_error):
1042 sync += Display("combpleting MMU ld with error")
1043
1044 # Slow ops (i.e. load miss)
1045 with m.If(r1.slow_valid):
1046 sync += Display("completing MMU load miss, data=%x",
1047 m_out.data)
1048
1049 def rams(self, m, r1, early_req_row, cache_out, replace_way):
1050 """rams
1051 Generate a cache RAM for each way. This handles the normal
1052 reads, writes from reloads and the special store-hit update
1053 path as well.
1054
1055 Note: the BRAMs have an extra read buffer, meaning the output
1056 is pipelined an extra cycle. This differs from the
1057 icache. The writeback logic needs to take that into
1058 account by using 1-cycle delayed signals for load hits.
1059 """
1060 comb = m.d.comb
1061 wb_in = self.wb_in
1062
1063 for i in range(NUM_WAYS):
1064 do_read = Signal(name="do_rd%d" % i)
1065 rd_addr = Signal(ROW_BITS)
1066 do_write = Signal(name="do_wr%d" % i)
1067 wr_addr = Signal(ROW_BITS)
1068 wr_data = Signal(WB_DATA_BITS)
1069 wr_sel = Signal(ROW_SIZE)
1070 wr_sel_m = Signal(ROW_SIZE)
1071 _d_out = Signal(WB_DATA_BITS, name="dout_%d" % i)
1072
1073 way = CacheRam(ROW_BITS, WB_DATA_BITS, True)
1074 setattr(m.submodules, "cacheram_%d" % i, way)
1075
1076 comb += way.rd_en.eq(do_read)
1077 comb += way.rd_addr.eq(rd_addr)
1078 comb += _d_out.eq(way.rd_data_o)
1079 comb += way.wr_sel.eq(wr_sel_m)
1080 comb += way.wr_addr.eq(wr_addr)
1081 comb += way.wr_data.eq(wr_data)
1082
1083 # Cache hit reads
1084 comb += do_read.eq(1)
1085 comb += rd_addr.eq(early_req_row[:ROW_BITS])
1086 comb += cache_out[i].eq(_d_out)
1087
1088 # Write mux:
1089 #
1090 # Defaults to wishbone read responses (cache refill)
1091 #
1092 # For timing, the mux on wr_data/sel/addr is not
1093 # dependent on anything other than the current state.
1094
1095 with m.If(r1.write_bram):
1096 # Write store data to BRAM. This happens one
1097 # cycle after the store is in r0.
1098 comb += wr_data.eq(r1.req.data)
1099 comb += wr_sel.eq(r1.req.byte_sel)
1100 comb += wr_addr.eq(get_row(r1.req.real_addr))
1101
1102 with m.If(i == r1.req.hit_way):
1103 comb += do_write.eq(1)
1104 with m.Else():
1105 # Otherwise, we might be doing a reload or a DCBZ
1106 with m.If(r1.dcbz):
1107 comb += wr_data.eq(0)
1108 with m.Else():
1109 comb += wr_data.eq(wb_in.dat)
1110 comb += wr_addr.eq(r1.store_row)
1111 comb += wr_sel.eq(~0) # all 1s
1112
1113 with m.If((r1.state == State.RELOAD_WAIT_ACK)
1114 & wb_in.ack & (replace_way == i)):
1115 comb += do_write.eq(1)
1116
1117 # Mask write selects with do_write since BRAM
1118 # doesn't have a global write-enable
1119 with m.If(do_write):
1120 comb += wr_sel_m.eq(wr_sel)
1121
1122 # Cache hit synchronous machine for the easy case.
1123 # This handles load hits.
1124 # It also handles error cases (TLB miss, cache paradox)
1125 def dcache_fast_hit(self, m, req_op, r0_valid, r0, r1,
1126 req_hit_way, req_index, req_tag, access_ok,
1127 tlb_hit, tlb_hit_way, tlb_req_index):
1128
1129 comb = m.d.comb
1130 sync = m.d.sync
1131
1132 with m.If(req_op != Op.OP_NONE):
1133 sync += Display("op:%d addr:%x nc: %d idx: %x tag: %x way: %x",
1134 req_op, r0.req.addr, r0.req.nc,
1135 req_index, req_tag, req_hit_way)
1136
1137 with m.If(r0_valid):
1138 sync += r1.mmu_req.eq(r0.mmu_req)
1139
1140 # Fast path for load/store hits.
1141 # Set signals for the writeback controls.
1142 sync += r1.hit_way.eq(req_hit_way)
1143 sync += r1.hit_index.eq(req_index)
1144
1145 with m.If(req_op == Op.OP_LOAD_HIT):
1146 sync += r1.hit_load_valid.eq(1)
1147 with m.Else():
1148 sync += r1.hit_load_valid.eq(0)
1149
1150 with m.If((req_op == Op.OP_LOAD_HIT) | (req_op == Op.OP_STORE_HIT)):
1151 sync += r1.cache_hit.eq(1)
1152 with m.Else():
1153 sync += r1.cache_hit.eq(0)
1154
1155 with m.If(req_op == Op.OP_BAD):
1156 # Display(f"Signalling ld/st error valid_ra={valid_ra}"
1157 # f"rc_ok={rc_ok} perm_ok={perm_ok}"
1158 sync += r1.ls_error.eq(~r0.mmu_req)
1159 sync += r1.mmu_error.eq(r0.mmu_req)
1160 sync += r1.cache_paradox.eq(access_ok)
1161
1162 with m.Else():
1163 sync += r1.ls_error.eq(0)
1164 sync += r1.mmu_error.eq(0)
1165 sync += r1.cache_paradox.eq(0)
1166
1167 with m.If(req_op == Op.OP_STCX_FAIL):
1168 sync += r1.stcx_fail.eq(1)
1169 with m.Else():
1170 sync += r1.stcx_fail.eq(0)
1171
1172 # Record TLB hit information for updating TLB PLRU
1173 sync += r1.tlb_hit.eq(tlb_hit)
1174 sync += r1.tlb_hit_way.eq(tlb_hit_way)
1175 sync += r1.tlb_hit_index.eq(tlb_req_index)
1176
1177 # Memory accesses are handled by this state machine:
1178 #
1179 # * Cache load miss/reload (in conjunction with "rams")
1180 # * Load hits for non-cachable forms
1181 # * Stores (the collision case is handled in "rams")
1182 #
1183 # All wishbone requests generation is done here.
1184 # This machine operates at stage 1.
1185 def dcache_slow(self, m, r1, use_forward1_next, use_forward2_next,
1186 cache_valid_bits, r0, replace_way,
1187 req_hit_way, req_same_tag,
1188 r0_valid, req_op, cache_tags, req_go, ra):
1189
1190 comb = m.d.comb
1191 sync = m.d.sync
1192 wb_in = self.wb_in
1193
1194 req = MemAccessRequest("mreq_ds")
1195 acks = Signal(3)
1196 adjust_acks = Signal(3)
1197
1198 sync += r1.use_forward1.eq(use_forward1_next)
1199 sync += r1.forward_sel.eq(0)
1200
1201 with m.If(use_forward1_next):
1202 sync += r1.forward_sel.eq(r1.req.byte_sel)
1203 with m.Elif(use_forward2_next):
1204 sync += r1.forward_sel.eq(r1.forward_sel1)
1205
1206 sync += r1.forward_data2.eq(r1.forward_data1)
1207 with m.If(r1.write_bram):
1208 sync += r1.forward_data1.eq(r1.req.data)
1209 sync += r1.forward_sel1.eq(r1.req.byte_sel)
1210 sync += r1.forward_way1.eq(r1.req.hit_way)
1211 sync += r1.forward_row1.eq(get_row(r1.req.real_addr))
1212 sync += r1.forward_valid1.eq(1)
1213 with m.Else():
1214 with m.If(r1.dcbz):
1215 sync += r1.forward_data1.eq(0)
1216 with m.Else():
1217 sync += r1.forward_data1.eq(wb_in.dat)
1218 sync += r1.forward_sel1.eq(~0) # all 1s
1219 sync += r1.forward_way1.eq(replace_way)
1220 sync += r1.forward_row1.eq(r1.store_row)
1221 sync += r1.forward_valid1.eq(0)
1222
1223 # One cycle pulses reset
1224 sync += r1.slow_valid.eq(0)
1225 sync += r1.write_bram.eq(0)
1226 sync += r1.inc_acks.eq(0)
1227 sync += r1.dec_acks.eq(0)
1228
1229 sync += r1.ls_valid.eq(0)
1230 # complete tlbies and TLB loads in the third cycle
1231 sync += r1.mmu_done.eq(r0_valid & (r0.tlbie | r0.tlbld))
1232
1233 with m.If((req_op == Op.OP_LOAD_HIT)
1234 | (req_op == Op.OP_STCX_FAIL)):
1235 with m.If(~r0.mmu_req):
1236 sync += r1.ls_valid.eq(1)
1237 with m.Else():
1238 sync += r1.mmu_done.eq(1)
1239
1240 with m.If(r1.write_tag):
1241 # Store new tag in selected way
1242 for i in range(NUM_WAYS):
1243 with m.If(i == replace_way):
1244 ct = Signal(TAG_RAM_WIDTH)
1245 comb += ct.eq(cache_tags[r1.store_index])
1246 comb += ct.word_select(i, TAG_WIDTH).eq(r1.reload_tag)
1247 sync += cache_tags[r1.store_index].eq(ct)
1248 sync += r1.store_way.eq(replace_way)
1249 sync += r1.write_tag.eq(0)
1250
1251 # Take request from r1.req if there is one there,
1252 # else from req_op, ra, etc.
1253 with m.If(r1.full):
1254 comb += req.eq(r1.req)
1255 with m.Else():
1256 comb += req.op.eq(req_op)
1257 comb += req.valid.eq(req_go)
1258 comb += req.mmu_req.eq(r0.mmu_req)
1259 comb += req.dcbz.eq(r0.req.dcbz)
1260 comb += req.real_addr.eq(ra)
1261
1262 with m.If(~r0.req.dcbz):
1263 comb += req.data.eq(r0.req.data)
1264 with m.Else():
1265 comb += req.data.eq(0)
1266
1267 # Select all bytes for dcbz
1268 # and for cacheable loads
1269 with m.If(r0.req.dcbz | (r0.req.load & ~r0.req.nc)):
1270 comb += req.byte_sel.eq(~0) # all 1s
1271 with m.Else():
1272 comb += req.byte_sel.eq(r0.req.byte_sel)
1273 comb += req.hit_way.eq(req_hit_way)
1274 comb += req.same_tag.eq(req_same_tag)
1275
1276 # Store the incoming request from r0,
1277 # if it is a slow request
1278 # Note that r1.full = 1 implies req_op = OP_NONE
1279 with m.If((req_op == Op.OP_LOAD_MISS)
1280 | (req_op == Op.OP_LOAD_NC)
1281 | (req_op == Op.OP_STORE_MISS)
1282 | (req_op == Op.OP_STORE_HIT)):
1283 sync += r1.req.eq(req)
1284 sync += r1.full.eq(1)
1285
1286 # Main state machine
1287 with m.Switch(r1.state):
1288
1289 with m.Case(State.IDLE):
1290 # XXX check 'left downto. probably means len(r1.wb.adr)
1291 # r1.wb.adr <= req.real_addr(
1292 # r1.wb.adr'left downto 0
1293 # );
1294 sync += r1.wb.adr.eq(req.real_addr)
1295 sync += r1.wb.sel.eq(req.byte_sel)
1296 sync += r1.wb.dat.eq(req.data)
1297 sync += r1.dcbz.eq(req.dcbz)
1298
1299 # Keep track of our index and way
1300 # for subsequent stores.
1301 sync += r1.store_index.eq(get_index(req.real_addr))
1302 sync += r1.store_row.eq(get_row(req.real_addr))
1303 sync += r1.end_row_ix.eq(
1304 get_row_of_line(get_row(req.real_addr))
1305 )
1306 sync += r1.reload_tag.eq(get_tag(req.real_addr))
1307 sync += r1.req.same_tag.eq(1)
1308
1309 with m.If(req.op == Op.OP_STORE_HIT):
1310 sync += r1.store_way.eq(req.hit_way)
1311
1312 # Reset per-row valid bits,
1313 # ready for handling OP_LOAD_MISS
1314 for i in range(ROW_PER_LINE):
1315 sync += r1.rows_valid[i].eq(0)
1316
1317 with m.If(req_op != Op.OP_NONE):
1318 sync += Display("cache op %d", req.op)
1319
1320 with m.Switch(req.op):
1321 with m.Case(Op.OP_LOAD_HIT):
1322 # stay in IDLE state
1323 pass
1324
1325 with m.Case(Op.OP_LOAD_MISS):
1326 #Display(f"cache miss real addr:" \
1327 # f"{req_real_addr}" \
1328 # f" idx:{get_index(req_real_addr)}" \
1329 # f" tag:{get_tag(req.real_addr)}")
1330 pass
1331
1332 # Start the wishbone cycle
1333 sync += r1.wb.we.eq(0)
1334 sync += r1.wb.cyc.eq(1)
1335 sync += r1.wb.stb.eq(1)
1336
1337 # Track that we had one request sent
1338 sync += r1.state.eq(State.RELOAD_WAIT_ACK)
1339 sync += r1.write_tag.eq(1)
1340
1341 with m.Case(Op.OP_LOAD_NC):
1342 sync += r1.wb.cyc.eq(1)
1343 sync += r1.wb.stb.eq(1)
1344 sync += r1.wb.we.eq(0)
1345 sync += r1.state.eq(State.NC_LOAD_WAIT_ACK)
1346
1347 with m.Case(Op.OP_STORE_HIT, Op.OP_STORE_MISS):
1348 with m.If(~req.dcbz):
1349 sync += r1.state.eq(State.STORE_WAIT_ACK)
1350 sync += r1.acks_pending.eq(1)
1351 sync += r1.full.eq(0)
1352 sync += r1.slow_valid.eq(1)
1353
1354 with m.If(~req.mmu_req):
1355 sync += r1.ls_valid.eq(1)
1356 with m.Else():
1357 sync += r1.mmu_done.eq(1)
1358
1359 with m.If(req.op == Op.OP_STORE_HIT):
1360 sync += r1.write_bram.eq(1)
1361 with m.Else():
1362 # dcbz is handled much like a load miss except
1363 # that we are writing to memory instead of reading
1364 sync += r1.state.eq(State.RELOAD_WAIT_ACK)
1365
1366 with m.If(req.op == Op.OP_STORE_MISS):
1367 sync += r1.write_tag.eq(1)
1368
1369 sync += r1.wb.we.eq(1)
1370 sync += r1.wb.cyc.eq(1)
1371 sync += r1.wb.stb.eq(1)
1372
1373 # OP_NONE and OP_BAD do nothing
1374 # OP_BAD & OP_STCX_FAIL were
1375 # handled above already
1376 with m.Case(Op.OP_NONE):
1377 pass
1378 with m.Case(Op.OP_BAD):
1379 pass
1380 with m.Case(Op.OP_STCX_FAIL):
1381 pass
1382
1383 with m.Case(State.RELOAD_WAIT_ACK):
1384 ld_stbs_done = Signal()
1385 # Requests are all sent if stb is 0
1386 comb += ld_stbs_done.eq(~r1.wb.stb)
1387
1388 with m.If((~wb_in.stall) & r1.wb.stb):
1389 # That was the last word?
1390 # We are done sending.
1391 # Clear stb and set ld_stbs_done
1392 # so we can handle an eventual
1393 # last ack on the same cycle.
1394 with m.If(is_last_row_addr(r1.wb.adr, r1.end_row_ix)):
1395 sync += r1.wb.stb.eq(0)
1396 comb += ld_stbs_done.eq(1)
1397
1398 # Calculate the next row address in the current cache line
1399 rarange = r1.wb.adr[ROW_OFF_BITS : LINE_OFF_BITS]
1400 sync += rarange.eq(rarange + 1)
1401
1402 # Incoming acks processing
1403 sync += r1.forward_valid1.eq(wb_in.ack)
1404 with m.If(wb_in.ack):
1405 sync += r1.rows_valid[r1.store_row % ROW_PER_LINE].eq(1)
1406
1407 # If this is the data we were looking for,
1408 # we can complete the request next cycle.
1409 # Compare the whole address in case the
1410 # request in r1.req is not the one that
1411 # started this refill.
1412 with m.If(r1.full & r1.req.same_tag &
1413 ((r1.dcbz & r1.req.dcbz) |
1414 (~r1.dcbz & (r1.req.op == Op.OP_LOAD_MISS))) &
1415 (r1.store_row == get_row(r1.req.real_addr))):
1416 sync += r1.full.eq(0)
1417 sync += r1.slow_valid.eq(1)
1418 with m.If(~r1.mmu_req):
1419 sync += r1.ls_valid.eq(1)
1420 with m.Else():
1421 sync += r1.mmu_done.eq(1)
1422 sync += r1.forward_sel.eq(~0) # all 1s
1423 sync += r1.use_forward1.eq(1)
1424
1425 # Check for completion
1426 with m.If(ld_stbs_done & is_last_row(r1.store_row,
1427 r1.end_row_ix)):
1428 # Complete wishbone cycle
1429 sync += r1.wb.cyc.eq(0)
1430
1431 # Cache line is now valid
1432 cv = Signal(NUM_WAYS)
1433 comb += cv.eq(cache_valid_bits[r1.store_index])
1434 comb += cv.bit_select(r1.store_way, 1).eq(1)
1435 sync += cache_valid_bits[r1.store_index].eq(cv)
1436 sync += r1.state.eq(State.IDLE)
1437
1438 # Increment store row counter
1439 sync += r1.store_row.eq(next_row(r1.store_row))
1440
1441 with m.Case(State.STORE_WAIT_ACK):
1442 st_stbs_done = Signal()
1443 comb += st_stbs_done.eq(~r1.wb.stb)
1444 comb += acks.eq(r1.acks_pending)
1445
1446 with m.If(r1.inc_acks != r1.dec_acks):
1447 with m.If(r1.inc_acks):
1448 comb += adjust_acks.eq(acks + 1)
1449 with m.Else():
1450 comb += adjust_acks.eq(acks - 1)
1451 with m.Else():
1452 comb += adjust_acks.eq(acks)
1453
1454 sync += r1.acks_pending.eq(adjust_acks)
1455
1456 # Clear stb when slave accepted request
1457 with m.If(~wb_in.stall):
1458 # See if there is another store waiting
1459 # to be done which is in the same real page.
1460 with m.If(req.valid):
1461 ra = req.real_addr[0:SET_SIZE_BITS]
1462 sync += r1.wb.adr[0:SET_SIZE_BITS].eq(ra)
1463 sync += r1.wb.dat.eq(req.data)
1464 sync += r1.wb.sel.eq(req.byte_sel)
1465
1466 with m.Elif((adjust_acks < 7) & req.same_tag &
1467 ((req.op == Op.OP_STORE_MISS)
1468 | (req.op == Op.OP_STORE_HIT))):
1469 sync += r1.wb.stb.eq(1)
1470 comb += st_stbs_done.eq(0)
1471
1472 with m.If(req.op == Op.OP_STORE_HIT):
1473 sync += r1.write_bram.eq(1)
1474 sync += r1.full.eq(0)
1475 sync += r1.slow_valid.eq(1)
1476
1477 # Store requests never come from the MMU
1478 sync += r1.ls_valid.eq(1)
1479 comb += st_stbs_done.eq(0)
1480 sync += r1.inc_acks.eq(1)
1481 with m.Else():
1482 sync += r1.wb.stb.eq(0)
1483 comb += st_stbs_done.eq(1)
1484
1485 # Got ack ? See if complete.
1486 with m.If(wb_in.ack):
1487 with m.If(st_stbs_done & (adjust_acks == 1)):
1488 sync += r1.state.eq(State.IDLE)
1489 sync += r1.wb.cyc.eq(0)
1490 sync += r1.wb.stb.eq(0)
1491 sync += r1.dec_acks.eq(1)
1492
1493 with m.Case(State.NC_LOAD_WAIT_ACK):
1494 # Clear stb when slave accepted request
1495 with m.If(~wb_in.stall):
1496 sync += r1.wb.stb.eq(0)
1497
1498 # Got ack ? complete.
1499 with m.If(wb_in.ack):
1500 sync += r1.state.eq(State.IDLE)
1501 sync += r1.full.eq(0)
1502 sync += r1.slow_valid.eq(1)
1503
1504 with m.If(~r1.mmu_req):
1505 sync += r1.ls_valid.eq(1)
1506 with m.Else():
1507 sync += r1.mmu_done.eq(1)
1508
1509 sync += r1.forward_sel.eq(~0) # all 1s
1510 sync += r1.use_forward1.eq(1)
1511 sync += r1.wb.cyc.eq(0)
1512 sync += r1.wb.stb.eq(0)
1513
1514 def dcache_log(self, m, r1, valid_ra, tlb_hit_way, stall_out, req_op):
1515
1516 sync = m.d.sync
1517 d_out, wb_in, log_out = self.d_out, self.wb_in, self.log_out
1518
1519 sync += log_out.eq(Cat(r1.state[:3], valid_ra, tlb_hit_way[:3],
1520 stall_out, req_op[:3], d_out.valid, d_out.error,
1521 r1.wb.cyc, r1.wb.stb, wb_in.ack, wb_in.stall,
1522 r1.wb.adr[3:6]))
1523
1524 def elaborate(self, platform):
1525
1526 m = Module()
1527 comb = m.d.comb
1528
1529 # Storage. Hopefully "cache_rows" is a BRAM, the rest is LUTs
1530 cache_tags = CacheTagArray()
1531 cache_tag_set = Signal(TAG_RAM_WIDTH)
1532 cache_valid_bits = CacheValidBitsArray()
1533
1534 # TODO attribute ram_style : string;
1535 # TODO attribute ram_style of cache_tags : signal is "distributed";
1536
1537 """note: these are passed to nmigen.hdl.Memory as "attributes".
1538 don't know how, just that they are.
1539 """
1540 dtlb_valid_bits = TLBValidBitsArray()
1541 dtlb_tags = TLBTagsArray()
1542 dtlb_ptes = TLBPtesArray()
1543 # TODO attribute ram_style of
1544 # dtlb_tags : signal is "distributed";
1545 # TODO attribute ram_style of
1546 # dtlb_ptes : signal is "distributed";
1547
1548 r0 = RegStage0("r0")
1549 r0_full = Signal()
1550
1551 r1 = RegStage1("r1")
1552
1553 reservation = Reservation()
1554
1555 # Async signals on incoming request
1556 req_index = Signal(INDEX_BITS)
1557 req_row = Signal(ROW_BITS)
1558 req_hit_way = Signal(WAY_BITS)
1559 req_tag = Signal(TAG_BITS)
1560 req_op = Signal(Op)
1561 req_data = Signal(64)
1562 req_same_tag = Signal()
1563 req_go = Signal()
1564
1565 early_req_row = Signal(ROW_BITS)
1566
1567 cancel_store = Signal()
1568 set_rsrv = Signal()
1569 clear_rsrv = Signal()
1570
1571 r0_valid = Signal()
1572 r0_stall = Signal()
1573
1574 use_forward1_next = Signal()
1575 use_forward2_next = Signal()
1576
1577 cache_out = CacheRamOut()
1578
1579 plru_victim = PLRUOut()
1580 replace_way = Signal(WAY_BITS)
1581
1582 # Wishbone read/write/cache write formatting signals
1583 bus_sel = Signal(8)
1584
1585 # TLB signals
1586 tlb_tag_way = Signal(TLB_TAG_WAY_BITS)
1587 tlb_pte_way = Signal(TLB_PTE_WAY_BITS)
1588 tlb_valid_way = Signal(TLB_NUM_WAYS)
1589 tlb_req_index = Signal(TLB_SET_BITS)
1590 tlb_hit = Signal()
1591 tlb_hit_way = Signal(TLB_WAY_BITS)
1592 pte = Signal(TLB_PTE_BITS)
1593 ra = Signal(REAL_ADDR_BITS)
1594 valid_ra = Signal()
1595 perm_attr = PermAttr("dc_perms")
1596 rc_ok = Signal()
1597 perm_ok = Signal()
1598 access_ok = Signal()
1599
1600 tlb_plru_victim = TLBPLRUOut()
1601
1602 # we don't yet handle collisions between loadstore1 requests
1603 # and MMU requests
1604 comb += self.m_out.stall.eq(0)
1605
1606 # Hold off the request in r0 when r1 has an uncompleted request
1607 comb += r0_stall.eq(r0_full & r1.full)
1608 comb += r0_valid.eq(r0_full & ~r1.full)
1609 comb += self.stall_out.eq(r0_stall)
1610
1611 # Wire up wishbone request latch out of stage 1
1612 comb += self.wb_out.eq(r1.wb)
1613
1614 # call sub-functions putting everything together, using shared
1615 # signals established above
1616 self.stage_0(m, r0, r1, r0_full)
1617 self.tlb_read(m, r0_stall, tlb_valid_way,
1618 tlb_tag_way, tlb_pte_way, dtlb_valid_bits,
1619 dtlb_tags, dtlb_ptes)
1620 self.tlb_search(m, tlb_req_index, r0, r0_valid,
1621 tlb_valid_way, tlb_tag_way, tlb_hit_way,
1622 tlb_pte_way, pte, tlb_hit, valid_ra, perm_attr, ra)
1623 self.tlb_update(m, r0_valid, r0, dtlb_valid_bits, tlb_req_index,
1624 tlb_hit_way, tlb_hit, tlb_plru_victim, tlb_tag_way,
1625 dtlb_tags, tlb_pte_way, dtlb_ptes)
1626 self.maybe_plrus(m, r1, plru_victim)
1627 self.maybe_tlb_plrus(m, r1, tlb_plru_victim)
1628 self.cache_tag_read(m, r0_stall, req_index, cache_tag_set, cache_tags)
1629 self.dcache_request(m, r0, ra, req_index, req_row, req_tag,
1630 r0_valid, r1, cache_valid_bits, replace_way,
1631 use_forward1_next, use_forward2_next,
1632 req_hit_way, plru_victim, rc_ok, perm_attr,
1633 valid_ra, perm_ok, access_ok, req_op, req_go,
1634 tlb_pte_way,
1635 tlb_hit, tlb_hit_way, tlb_valid_way, cache_tag_set,
1636 cancel_store, req_same_tag, r0_stall, early_req_row)
1637 self.reservation_comb(m, cancel_store, set_rsrv, clear_rsrv,
1638 r0_valid, r0, reservation)
1639 self.reservation_reg(m, r0_valid, access_ok, set_rsrv, clear_rsrv,
1640 reservation, r0)
1641 self.writeback_control(m, r1, cache_out)
1642 self.rams(m, r1, early_req_row, cache_out, replace_way)
1643 self.dcache_fast_hit(m, req_op, r0_valid, r0, r1,
1644 req_hit_way, req_index, req_tag, access_ok,
1645 tlb_hit, tlb_hit_way, tlb_req_index)
1646 self.dcache_slow(m, r1, use_forward1_next, use_forward2_next,
1647 cache_valid_bits, r0, replace_way,
1648 req_hit_way, req_same_tag,
1649 r0_valid, req_op, cache_tags, req_go, ra)
1650 #self.dcache_log(m, r1, valid_ra, tlb_hit_way, self.stall_out, req_op)
1651
1652 return m
1653
1654 def dcache_load(dut, addr, nc=0):
1655 yield dut.d_in.load.eq(1)
1656 yield dut.d_in.nc.eq(nc)
1657 yield dut.d_in.addr.eq(addr)
1658 yield dut.d_in.valid.eq(1)
1659 yield
1660 yield dut.d_in.valid.eq(0)
1661 yield
1662 while not (yield dut.d_out.valid):
1663 yield
1664 data = yield dut.d_out.data
1665 return data
1666
1667
1668 def dcache_store(dut, addr, data, nc=0):
1669 yield dut.d_in.load.eq(0)
1670 yield dut.d_in.nc.eq(nc)
1671 yield dut.d_in.data.eq(data)
1672 yield dut.d_in.byte_sel.eq(~0)
1673 yield dut.d_in.addr.eq(addr)
1674 yield dut.d_in.valid.eq(1)
1675 yield
1676 yield dut.d_in.valid.eq(0)
1677 yield dut.d_in.byte_sel.eq(0)
1678 yield
1679 while not (yield dut.d_out.valid):
1680 yield
1681
1682
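# The two helpers above drive the d_in port directly: the request fields are
# set and d_in.valid is raised for a single cycle, then the process polls
# d_out.valid for completion before reading back any data.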
1683 def dcache_random_sim(dut):
1684
1685 # shadow model of memory contents, initially all zeros
1686 sim_mem = [0] * 512
1687
1688 # clear stuff
1689 yield dut.d_in.valid.eq(0)
1690 yield dut.d_in.load.eq(0)
1691 yield dut.d_in.priv_mode.eq(1)
1692 yield dut.d_in.nc.eq(0)
1693 yield dut.d_in.addr.eq(0)
1694 yield dut.d_in.data.eq(0)
1695 yield dut.m_in.valid.eq(0)
1696 yield dut.m_in.addr.eq(0)
1697 yield dut.m_in.pte.eq(0)
1698 # wait 4 * clk_period
1699 yield
1700 yield
1701 yield
1702 yield
1703
1704 print ()
1705
1706 for i in range(256):
1707 addr = randint(0, 255)
1708 data = randint(0, (1<<64)-1)
1709 sim_mem[addr] = data
1710 addr *= 8
1711
1712 print ("testing %x data %x" % (addr, data))
1713
1714 yield from dcache_load(dut, addr)
1715 yield from dcache_store(dut, addr, data)
1716
1717 addr = randint(0, 255)
1718 sim_data = sim_mem[addr]
1719 addr *= 8
1720
1721 data = yield from dcache_load(dut, addr)
1722 assert data == sim_data, \
1723 "check %x data %x != %x" % (addr, data, sim_data)
1724
1725 for addr in range(8):
1726 data = yield from dcache_load(dut, addr*8)
1727 assert data == sim_mem[addr], \
1728 "final check %x data %x != %x" % (addr*8, data, sim_mem[addr])
1729
1730 def dcache_sim(dut):
1731 # clear stuff
1732 yield dut.d_in.valid.eq(0)
1733 yield dut.d_in.load.eq(0)
1734 yield dut.d_in.priv_mode.eq(1)
1735 yield dut.d_in.nc.eq(0)
1736 yield dut.d_in.addr.eq(0)
1737 yield dut.d_in.data.eq(0)
1738 yield dut.m_in.valid.eq(0)
1739 yield dut.m_in.addr.eq(0)
1740 yield dut.m_in.pte.eq(0)
1741 # wait 4 * clk_period
1742 yield
1743 yield
1744 yield
1745 yield
1746
1747 # Cacheable read of address 4
1748 data = yield from dcache_load(dut, 0x4)
1749 addr = yield dut.d_in.addr
1750 assert data == 0x0000000100000000, \
1751 f"data @%x=%x expected 0x0000000100000000" % (addr, data)
1752
1753 # Cacheable read of address 0x530
1754 data = yield from dcache_load(dut, 0x530)
1755 addr = yield dut.d_in.addr
1756 assert data == 0x0000014D0000014C, \
1757 "data @%x=%x expected 0x0000014D0000014C" % (addr, data)
1758
1759 # 2nd Cacheable read of address 0x530
1760 data = yield from dcache_load(dut, 0x530)
1761 addr = yield dut.d_in.addr
1762 assert data == 0x0000014D0000014C, \
1763 "data @%x=%x expected 0x0000014D0000014C" % (addr, data)
1764
1765 # Non-cacheable read of address 0x100
1766 data = yield from dcache_load(dut, 0x100, nc=1)
1767 addr = yield dut.d_in.addr
1768 assert data == 0x0000004100000040, \
1769 "data @%x=%x expected 0x0000004100000040" % (addr, data)
1770
1771 # Store at address 0x530
1772 yield from dcache_store(dut, 0x530, 0x121)
1773 
1774 # Second store at address 0x530 (overwrites the first value)
1775 yield from dcache_store(dut, 0x530, 0x12345678)
1776
1777 # 3rd Cacheable read of address 0x530
1778 data = yield from dcache_load(dut, 0x530)
1779 addr = yield dut.d_in.addr
1780 assert data == 0x12345678, \
1781 "data @%x=%x expected 0x12345678" % (addr, data)
1782
1783 yield
1784 yield
1785 yield
1786 yield
1787
1788
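# Test harness: instantiate the DCache back-to-back with a wishbone SRAM
# model, wire the wishbone master signals across, and run the supplied
# generator-based stimulus process under the nmigen simulator.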
1789 def test_dcache(mem, test_fn, test_name):
1790 dut = DCache()
1791
1792 memory = Memory(width=64, depth=16*64, init=mem)
1793 sram = SRAM(memory=memory, granularity=8)
1794
1795 m = Module()
1796 m.submodules.dcache = dut
1797 m.submodules.sram = sram
1798
1799 m.d.comb += sram.bus.cyc.eq(dut.wb_out.cyc)
1800 m.d.comb += sram.bus.stb.eq(dut.wb_out.stb)
1801 m.d.comb += sram.bus.we.eq(dut.wb_out.we)
1802 m.d.comb += sram.bus.sel.eq(dut.wb_out.sel)
1803 m.d.comb += sram.bus.adr.eq(dut.wb_out.adr[3:]) # byte address -> 64-bit word address
1804 m.d.comb += sram.bus.dat_w.eq(dut.wb_out.dat)
1805
1806 m.d.comb += dut.wb_in.ack.eq(sram.bus.ack)
1807 m.d.comb += dut.wb_in.dat.eq(sram.bus.dat_r)
1808
1809 # nmigen Simulation
1810 sim = Simulator(m)
1811 sim.add_clock(1e-6)
1812
1813 sim.add_sync_process(wrap(test_fn(dut)))
1814 with sim.write_vcd('test_dcache_%s.vcd' % test_name):
1815 sim.run()
1816
1817 if __name__ == '__main__':
1818 dut = DCache()
1819 vl = rtlil.convert(dut, ports=[])
1820 with open("test_dcache.il", "w") as f:
1821 f.write(vl)
1822
1823 mem = []
1824 for i in range(0,512):
1825 mem.append((i*2)| ((i*2+1)<<32))
1826
1827 test_dcache(mem, dcache_sim, "quick")
1828 test_dcache(None, dcache_random_sim, "random")
1829