src/soc/experiment/dcache.py

   1 """DCache
   2
   3 based on Anton Blanchard microwatt dcache.vhdl
   4
   5 """
   6
   7 from enum import Enum, unique
   8
   9 from nmigen import Module, Signal, Elaboratable, Cat, Repl, Array, Const
  10 try:
  11     from nmigen.hdl.ast import Display
  12 except ImportError:
  13     def Display(*args):
  14         return []
  15
  16 from random import randint
  17
  18 from nmigen.cli import main
  19 from nmutil.iocontrol import RecordObject
  20 from nmutil.util import wrap
  21 from nmigen.utils import log2_int
  22 from soc.experiment.mem_types import (LoadStore1ToDCacheType,
  23                                      DCacheToLoadStore1Type,
  24                                      MMUToDCacheType,
  25                                      DCacheToMMUType)
  26
  27 from soc.experiment.wb_types import (WB_ADDR_BITS, WB_DATA_BITS, WB_SEL_BITS,
  28                                 WBAddrType, WBDataType, WBSelType,
  29                                 WBMasterOut, WBSlaveOut,
  30                                 WBMasterOutVector, WBSlaveOutVector,
  31                                 WBIOMasterOut, WBIOSlaveOut)
  32
  33 from soc.experiment.cache_ram import CacheRam
  34 from soc.experiment.plru import PLRU
  35
  36 # for test
  37 from nmigen_soc.wishbone.sram import SRAM
  38 from nmigen import Memory
  39 from nmigen.cli import rtlil
  40 if True:
  41     from nmigen.back.pysim import Simulator, Delay, Settle
  42 else:
  43     from nmigen.sim.cxxsim import Simulator, Delay, Settle
  44
  45
  46 # TODO: make these parameters of DCache at some point
  47 LINE_SIZE = 64    # Line size in bytes
  48 NUM_LINES = 16    # Number of lines in a set
  49 NUM_WAYS = 4      # Number of ways
  50 TLB_SET_SIZE = 64 # L1 DTLB entries per set
  51 TLB_NUM_WAYS = 2  # L1 DTLB number of sets
  52 TLB_LG_PGSZ = 12  # L1 DTLB log_2(page_size)
  53 LOG_LENGTH = 0    # Non-zero to enable log data collection
  54
  55 # BRAM organisation: We never access more than
  56 #     -- WB_DATA_BITS at a time so to save
  57 #     -- resources we make the array only that wide, and
  58 #     -- use consecutive indices for to make a cache "line"
  59 #     --
  60 #     -- ROW_SIZE is the width in bytes of the BRAM
  61 #     -- (based on WB, so 64-bits)
  62 ROW_SIZE = WB_DATA_BITS // 8;
  63
  64 # ROW_PER_LINE is the number of row (wishbone
  65 # transactions) in a line
  66 ROW_PER_LINE = LINE_SIZE // ROW_SIZE
  67
  68 # BRAM_ROWS is the number of rows in BRAM needed
  69 # to represent the full dcache
  70 BRAM_ROWS = NUM_LINES * ROW_PER_LINE
  71
  72
  73 # Bit fields counts in the address
  74
  75 # REAL_ADDR_BITS is the number of real address
  76 # bits that we store
  77 REAL_ADDR_BITS = 56
  78
  79 # ROW_BITS is the number of bits to select a row
  80 ROW_BITS = log2_int(BRAM_ROWS)
  81
  82 # ROW_LINE_BITS is the number of bits to select
  83 # a row within a line
  84 ROW_LINE_BITS = log2_int(ROW_PER_LINE)
  85
  86 # LINE_OFF_BITS is the number of bits for
  87 # the offset in a cache line
  88 LINE_OFF_BITS = log2_int(LINE_SIZE)
  89
  90 # ROW_OFF_BITS is the number of bits for
  91 # the offset in a row
  92 ROW_OFF_BITS = log2_int(ROW_SIZE)
  93
  94 # INDEX_BITS is the number if bits to
  95 # select a cache line
  96 INDEX_BITS = log2_int(NUM_LINES)
  97
  98 # SET_SIZE_BITS is the log base 2 of the set size
  99 SET_SIZE_BITS = LINE_OFF_BITS + INDEX_BITS
 100
 101 # TAG_BITS is the number of bits of
 102 # the tag part of the address
 103 TAG_BITS = REAL_ADDR_BITS - SET_SIZE_BITS
 104
 105 # TAG_WIDTH is the width in bits of each way of the tag RAM
 106 TAG_WIDTH = TAG_BITS + 7 - ((TAG_BITS + 7) % 8)
 107
 108 # WAY_BITS is the number of bits to select a way
 109 WAY_BITS = log2_int(NUM_WAYS)
 110
 111 # Example of layout for 32 lines of 64 bytes:
 112 layout = """\
 113   ..  tag    |index|  line  |
 114   ..         |   row   |    |
 115   ..         |     |---|    | ROW_LINE_BITS  (3)
 116   ..         |     |--- - --| LINE_OFF_BITS (6)
 117   ..         |         |- --| ROW_OFF_BITS  (3)
 118   ..         |----- ---|    | ROW_BITS      (8)
 119   ..         |-----|        | INDEX_BITS    (5)
 120   .. --------|              | TAG_BITS      (45)
 121 """
 122 print (layout)
 123 print ("Dcache TAG %d IDX %d ROW %d ROFF %d LOFF %d RLB %d" % \
 124             (TAG_BITS, INDEX_BITS, ROW_BITS,
 125              ROW_OFF_BITS, LINE_OFF_BITS, ROW_LINE_BITS))
 126 print ("index @: %d-%d" % (LINE_OFF_BITS, SET_SIZE_BITS))
 127 print ("row @: %d-%d" % (LINE_OFF_BITS, ROW_OFF_BITS))
 128 print ("tag @: %d-%d width %d" % (SET_SIZE_BITS, REAL_ADDR_BITS, TAG_WIDTH))
 129
 130 TAG_RAM_WIDTH = TAG_WIDTH * NUM_WAYS
 131
 132 def CacheTagArray():
 133     return Array(Signal(TAG_RAM_WIDTH, name="cachetag_%d" % x) \
 134                         for x in range(NUM_LINES))
 135
 136 def CacheValidBitsArray():
 137     return Array(Signal(INDEX_BITS, name="cachevalid_%d" % x) \
 138                         for x in range(NUM_LINES))
 139
 140 def RowPerLineValidArray():
 141     return Array(Signal(name="rows_valid%d" % x) \
 142                         for x in range(ROW_PER_LINE))
 143
 144 # L1 TLB
 145 TLB_SET_BITS     = log2_int(TLB_SET_SIZE)
 146 TLB_WAY_BITS     = log2_int(TLB_NUM_WAYS)
 147 TLB_EA_TAG_BITS  = 64 - (TLB_LG_PGSZ + TLB_SET_BITS)
 148 TLB_TAG_WAY_BITS = TLB_NUM_WAYS * TLB_EA_TAG_BITS
 149 TLB_PTE_BITS     = 64
 150 TLB_PTE_WAY_BITS = TLB_NUM_WAYS * TLB_PTE_BITS;
 151
 152 assert (LINE_SIZE % ROW_SIZE) == 0, "LINE_SIZE not multiple of ROW_SIZE"
 153 assert (LINE_SIZE % 2) == 0, "LINE_SIZE not power of 2"
 154 assert (NUM_LINES % 2) == 0, "NUM_LINES not power of 2"
 155 assert (ROW_PER_LINE % 2) == 0, "ROW_PER_LINE not power of 2"
 156 assert ROW_BITS == (INDEX_BITS + ROW_LINE_BITS), "geometry bits don't add up"
 157 assert (LINE_OFF_BITS == ROW_OFF_BITS + ROW_LINE_BITS), \
 158         "geometry bits don't add up"
 159 assert REAL_ADDR_BITS == (TAG_BITS + INDEX_BITS + LINE_OFF_BITS), \
 160         "geometry bits don't add up"
 161 assert REAL_ADDR_BITS == (TAG_BITS + ROW_BITS + ROW_OFF_BITS), \
 162          "geometry bits don't add up"
 163 assert 64 == WB_DATA_BITS, "Can't yet handle wb width that isn't 64-bits"
 164 assert SET_SIZE_BITS <= TLB_LG_PGSZ, "Set indexed by virtual address"
 165
 166
 167 def TLBValidBitsArray():
 168     return Array(Signal(TLB_NUM_WAYS) for x in range(TLB_SET_SIZE))
 169
 170 def TLBTagEAArray():
 171     return Array(Signal(TLB_EA_TAG_BITS) for x in range (TLB_NUM_WAYS))
 172
 173 def TLBTagsArray():
 174     return Array(Signal(TLB_TAG_WAY_BITS) for x in range (TLB_SET_SIZE))
 175
 176 def TLBPtesArray():
 177     return Array(Signal(TLB_PTE_WAY_BITS) for x in range(TLB_SET_SIZE))
 178
 179 def HitWaySet():
 180     return Array(Signal(WAY_BITS, name="hitway_%d" % x) \
 181                         for x in range(TLB_NUM_WAYS))
 182
 183 # Cache RAM interface
 184 def CacheRamOut():
 185     return Array(Signal(WB_DATA_BITS, name="cache_out%d" % x) \
 186                  for x in range(NUM_WAYS))
 187
 188 # PLRU output interface
 189 def PLRUOut():
 190     return Array(Signal(WAY_BITS) for x in range(NUM_LINES))
 191
 192 # TLB PLRU output interface
 193 def TLBPLRUOut():
 194     return Array(Signal(TLB_WAY_BITS) for x in range(TLB_SET_SIZE))
 195
 196 # Helper functions to decode incoming requests
 197 #
 198 # Return the cache line index (tag index) for an address
 199 def get_index(addr):
 200     return addr[LINE_OFF_BITS:SET_SIZE_BITS]
 201
 202 # Return the cache row index (data memory) for an address
 203 def get_row(addr):
 204     return addr[ROW_OFF_BITS:SET_SIZE_BITS]
 205
 206 # Return the index of a row within a line
 207 def get_row_of_line(row):
 208     return row[:ROW_BITS][:ROW_LINE_BITS]
 209
 210 # Returns whether this is the last row of a line
 211 def is_last_row_addr(addr, last):
 212     return addr[ROW_OFF_BITS:LINE_OFF_BITS] == last
 213
 214 # Returns whether this is the last row of a line
 215 def is_last_row(row, last):
 216     return get_row_of_line(row) == last
 217
 218 # Return the next row in the current cache line. We use a
 219 # dedicated function in order to limit the size of the
 220 # generated adder to be only the bits within a cache line
 221 # (3 bits with default settings)
 222 def next_row(row):
 223     row_v = row[0:ROW_LINE_BITS] + 1
 224     return Cat(row_v[:ROW_LINE_BITS], row[ROW_LINE_BITS:])
 225
 226 # Get the tag value from the address
 227 def get_tag(addr):
 228     return addr[SET_SIZE_BITS:REAL_ADDR_BITS]
 229
 230 # Read a tag from a tag memory row
 231 def read_tag(way, tagset):
 232     return tagset.word_select(way, TAG_WIDTH)[:TAG_BITS]
 233
 234 # Read a TLB tag from a TLB tag memory row
 235 def read_tlb_tag(way, tags):
 236     return tags.word_select(way, TLB_EA_TAG_BITS)
 237
 238 # Write a TLB tag to a TLB tag memory row
 239 def write_tlb_tag(way, tags, tag):
 240     return read_tlb_tag(way, tags).eq(tag)
 241
 242 # Read a PTE from a TLB PTE memory row
 243 def read_tlb_pte(way, ptes):
 244     return ptes.word_select(way, TLB_PTE_BITS)
 245
 246 def write_tlb_pte(way, ptes, newpte):
 247     return read_tlb_pte(way, ptes).eq(newpte)
 248
 249
 250 # Record for storing permission, attribute, etc. bits from a PTE
 251 class PermAttr(RecordObject):
 252     def __init__(self, name=None):
 253         super().__init__(name=name)
 254         self.reference = Signal()
 255         self.changed   = Signal()
 256         self.nocache   = Signal()
 257         self.priv      = Signal()
 258         self.rd_perm   = Signal()
 259         self.wr_perm   = Signal()
 260
 261
 262 def extract_perm_attr(pte):
 263     pa = PermAttr()
 264     pa.reference = pte[8]
 265     pa.changed   = pte[7]
 266     pa.nocache   = pte[5]
 267     pa.priv      = pte[3]
 268     pa.rd_perm   = pte[2]
 269     pa.wr_perm   = pte[1]
 270     return pa;
 271
 272
 273 # Type of operation on a "valid" input
 274 @unique
 275 class Op(Enum):
 276     OP_NONE       = 0
 277     OP_BAD        = 1 # NC cache hit, TLB miss, prot/RC failure
 278     OP_STCX_FAIL  = 2 # conditional store w/o reservation
 279     OP_LOAD_HIT   = 3 # Cache hit on load
 280     OP_LOAD_MISS  = 4 # Load missing cache
 281     OP_LOAD_NC    = 5 # Non-cachable load
 282     OP_STORE_HIT  = 6 # Store hitting cache
 283     OP_STORE_MISS = 7 # Store missing cache
 284
 285
 286 # Cache state machine
 287 @unique
 288 class State(Enum):
 289     IDLE             = 0 # Normal load hit processing
 290     RELOAD_WAIT_ACK  = 1 # Cache reload wait ack
 291     STORE_WAIT_ACK   = 2 # Store wait ack
 292     NC_LOAD_WAIT_ACK = 3 # Non-cachable load wait ack
 293
 294
 295 # Dcache operations:
 296 #
 297 # In order to make timing, we use the BRAMs with
 298 # an output buffer, which means that the BRAM
 299 # output is delayed by an extra cycle.
 300 #
 301 # Thus, the dcache has a 2-stage internal pipeline
 302 # for cache hits with no stalls.
 303 #
 304 # All other operations are handled via stalling
 305 # in the first stage.
 306 #
 307 # The second stage can thus complete a hit at the same
 308 # time as the first stage emits a stall for a complex op.
 309 #
 310 # Stage 0 register, basically contains just the latched request
 311
 312 class RegStage0(RecordObject):
 313     def __init__(self, name=None):
 314         super().__init__(name=name)
 315         self.req     = LoadStore1ToDCacheType(name="lsmem")
 316         self.tlbie   = Signal()
 317         self.doall   = Signal()
 318         self.tlbld   = Signal()
 319         self.mmu_req = Signal() # indicates source of request
 320
 321
 322 class MemAccessRequest(RecordObject):
 323     def __init__(self, name=None):
 324         super().__init__(name=name)
 325         self.op        = Signal(Op)
 326         self.valid     = Signal()
 327         self.dcbz      = Signal()
 328         self.real_addr = Signal(REAL_ADDR_BITS)
 329         self.data      = Signal(64)
 330         self.byte_sel  = Signal(8)
 331         self.hit_way   = Signal(WAY_BITS)
 332         self.same_tag  = Signal()
 333         self.mmu_req   = Signal()
 334
 335
 336 # First stage register, contains state for stage 1 of load hits
 337 # and for the state machine used by all other operations
 338 class RegStage1(RecordObject):
 339     def __init__(self, name=None):
 340         super().__init__(name=name)
 341         # Info about the request
 342         self.full             = Signal() # have uncompleted request
 343         self.mmu_req          = Signal() # request is from MMU
 344         self.req              = MemAccessRequest(name="reqmem")
 345
 346         # Cache hit state
 347         self.hit_way          = Signal(WAY_BITS)
 348         self.hit_load_valid   = Signal()
 349         self.hit_index        = Signal(INDEX_BITS)
 350         self.cache_hit        = Signal()
 351
 352         # TLB hit state
 353         self.tlb_hit          = Signal()
 354         self.tlb_hit_way      = Signal(TLB_NUM_WAYS)
 355         self.tlb_hit_index    = Signal(TLB_WAY_BITS)
 356
 357         # 2-stage data buffer for data forwarded from writes to reads
 358         self.forward_data1    = Signal(64)
 359         self.forward_data2    = Signal(64)
 360         self.forward_sel1     = Signal(8)
 361         self.forward_valid1   = Signal()
 362         self.forward_way1     = Signal(WAY_BITS)
 363         self.forward_row1     = Signal(ROW_BITS)
 364         self.use_forward1     = Signal()
 365         self.forward_sel      = Signal(8)
 366
 367         # Cache miss state (reload state machine)
 368         self.state            = Signal(State)
 369         self.dcbz             = Signal()
 370         self.write_bram       = Signal()
 371         self.write_tag        = Signal()
 372         self.slow_valid       = Signal()
 373         self.wb               = WBMasterOut("wb")
 374         self.reload_tag       = Signal(TAG_BITS)
 375         self.store_way        = Signal(WAY_BITS)
 376         self.store_row        = Signal(ROW_BITS)
 377         self.store_index      = Signal(INDEX_BITS)
 378         self.end_row_ix       = Signal(ROW_LINE_BITS)
 379         self.rows_valid       = RowPerLineValidArray()
 380         self.acks_pending     = Signal(3)
 381         self.inc_acks         = Signal()
 382         self.dec_acks         = Signal()
 383
 384         # Signals to complete (possibly with error)
 385         self.ls_valid         = Signal()
 386         self.ls_error         = Signal()
 387         self.mmu_done         = Signal()
 388         self.mmu_error        = Signal()
 389         self.cache_paradox    = Signal()
 390
 391         # Signal to complete a failed stcx.
 392         self.stcx_fail        = Signal()
 393
 394
 395 # Reservation information
 396 class Reservation(RecordObject):
 397     def __init__(self):
 398         super().__init__()
 399         self.valid = Signal()
 400         self.addr  = Signal(64-LINE_OFF_BITS)
 401
 402
 403 class DTLBUpdate(Elaboratable):
 404     def __init__(self):
 405         self.tlbie    = Signal()
 406         self.tlbwe    = Signal()
 407         self.doall    = Signal()
 408         self.updated  = Signal()
 409         self.v_updated  = Signal()
 410         self.tlb_hit    = Signal()
 411         self.tlb_req_index = Signal(TLB_SET_BITS)
 412
 413         self.tlb_hit_way     = Signal(TLB_WAY_BITS)
 414         self.tlb_tag_way     = Signal(TLB_TAG_WAY_BITS)
 415         self.tlb_pte_way     = Signal(TLB_PTE_WAY_BITS)
 416         self.repl_way        = Signal(TLB_WAY_BITS)
 417         self.eatag           = Signal(TLB_EA_TAG_BITS)
 418         self.pte_data        = Signal(TLB_PTE_BITS)
 419
 420         self.dv = Signal(TLB_PTE_WAY_BITS)
 421
 422         self.tb_out = Signal(TLB_TAG_WAY_BITS)
 423         self.pb_out = Signal(TLB_NUM_WAYS)
 424         self.db_out = Signal(TLB_PTE_WAY_BITS)
 425
 426     def elaborate(self, platform):
 427         m = Module()
 428         comb = m.d.comb
 429         sync = m.d.sync
 430
 431         tagset   = Signal(TLB_TAG_WAY_BITS)
 432         pteset   = Signal(TLB_PTE_WAY_BITS)
 433
 434         tb_out, pb_out, db_out = self.tb_out, self.pb_out, self.db_out
 435
 436         with m.If(self.tlbie & self.doall):
 437             pass # clear all back in parent
 438         with m.Elif(self.tlbie):
 439             with m.If(self.tlb_hit):
 440                 comb += db_out.eq(self.dv)
 441                 comb += db_out.bit_select(self.tlb_hit_way, 1).eq(1)
 442                 comb += self.v_updated.eq(1)
 443
 444         with m.Elif(self.tlbwe):
 445
 446             comb += tagset.eq(self.tlb_tag_way)
 447             comb += write_tlb_tag(self.repl_way, tagset, self.eatag)
 448             comb += tb_out.eq(tagset)
 449
 450             comb += pteset.eq(self.tlb_pte_way)
 451             comb += write_tlb_pte(self.repl_way, pteset, self.pte_data)
 452             comb += pb_out.eq(pteset)
 453
 454             comb += db_out.bit_select(self.repl_way, 1).eq(1)
 455
 456             comb += self.updated.eq(1)
 457             comb += self.v_updated.eq(1)
 458
 459         return m
 460
 461     def dcache_request(self, m, r0, ra, req_index, req_row, req_tag,
 462                        r0_valid, r1, cache_valid_bits, replace_way,
 463                        use_forward1_next, use_forward2_next,
 464                        req_hit_way, plru_victim, rc_ok, perm_attr,
 465                        valid_ra, perm_ok, access_ok, req_op, req_go,
 466                        tlb_pte_way,
 467                        tlb_hit, tlb_hit_way, tlb_valid_way, cache_tag_set,
 468                        cancel_store, req_same_tag, r0_stall, early_req_row):
 469         """Cache request parsing and hit detection
 470         """
 471
 472 class DCachePendingHit(Elaboratable):
 473
 474     def __init__(self, tlb_pte_way, tlb_valid_way, tlb_hit_way,
 475                       cache_valid_idx, cache_tag_set,
 476                     req_addr,
 477                     hit_set):
 478
 479         self.go          = Signal()
 480         self.virt_mode   = Signal()
 481         self.is_hit      = Signal()
 482         self.tlb_hit     = Signal()
 483         self.hit_way     = Signal(WAY_BITS)
 484         self.rel_match   = Signal()
 485         self.req_index   = Signal(INDEX_BITS)
 486         self.reload_tag  = Signal(TAG_BITS)
 487
 488         self.tlb_hit_way = tlb_hit_way
 489         self.tlb_pte_way = tlb_pte_way
 490         self.tlb_valid_way = tlb_valid_way
 491         self.cache_valid_idx = cache_valid_idx
 492         self.cache_tag_set = cache_tag_set
 493         self.req_addr = req_addr
 494         self.hit_set = hit_set
 495
 496     def elaborate(self, platform):
 497         m = Module()
 498         comb = m.d.comb
 499         sync = m.d.sync
 500
 501         go = self.go
 502         virt_mode = self.virt_mode
 503         is_hit = self.is_hit
 504         tlb_pte_way = self.tlb_pte_way
 505         tlb_valid_way = self.tlb_valid_way
 506         cache_valid_idx = self.cache_valid_idx
 507         cache_tag_set = self.cache_tag_set
 508         req_addr = self.req_addr
 509         tlb_hit_way = self.tlb_hit_way
 510         tlb_hit = self.tlb_hit
 511         hit_set = self.hit_set
 512         hit_way = self.hit_way
 513         rel_match = self.rel_match
 514         req_index = self.req_index
 515         reload_tag = self.reload_tag
 516
 517         rel_matches = Array(Signal(name="rel_matches_%d" % i) \
 518                                     for i in range(TLB_NUM_WAYS))
 519         hit_way_set = HitWaySet()
 520
 521         # Test if pending request is a hit on any way
 522         # In order to make timing in virtual mode,
 523         # when we are using the TLB, we compare each
 524         # way with each of the real addresses from each way of
 525         # the TLB, and then decide later which match to use.
 526
 527         with m.If(virt_mode):
 528             for j in range(TLB_NUM_WAYS):
 529                 s_tag       = Signal(TAG_BITS, name="s_tag%d" % j)
 530                 s_hit       = Signal()
 531                 s_pte       = Signal(TLB_PTE_BITS)
 532                 s_ra        = Signal(REAL_ADDR_BITS)
 533                 comb += s_pte.eq(read_tlb_pte(j, tlb_pte_way))
 534                 comb += s_ra.eq(Cat(req_addr[0:TLB_LG_PGSZ],
 535                                     s_pte[TLB_LG_PGSZ:REAL_ADDR_BITS]))
 536                 comb += s_tag.eq(get_tag(s_ra))
 537
 538                 for i in range(NUM_WAYS):
 539                     is_tag_hit = Signal(name="is_tag_hit_%d_%d" % (j, i))
 540                     comb += is_tag_hit.eq(go & cache_valid_idx[i] &
 541                                   (read_tag(i, cache_tag_set) == s_tag)
 542                                   & tlb_valid_way[j])
 543                     with m.If(is_tag_hit):
 544                         comb += hit_way_set[j].eq(i)
 545                         comb += s_hit.eq(1)
 546                 comb += hit_set[j].eq(s_hit)
 547                 with m.If(s_tag == reload_tag):
 548                     comb += rel_matches[j].eq(1)
 549             with m.If(tlb_hit):
 550                 comb += is_hit.eq(hit_set[tlb_hit_way])
 551                 comb += hit_way.eq(hit_way_set[tlb_hit_way])
 552                 comb += rel_match.eq(rel_matches[tlb_hit_way])
 553         with m.Else():
 554             s_tag       = Signal(TAG_BITS)
 555             comb += s_tag.eq(get_tag(req_addr))
 556             for i in range(NUM_WAYS):
 557                 is_tag_hit = Signal(name="is_tag_hit_%d" % i)
 558                 comb += is_tag_hit.eq(go & cache_valid_idx[i] &
 559                           (read_tag(i, cache_tag_set) == s_tag))
 560                 with m.If(is_tag_hit):
 561                     comb += hit_way.eq(i)
 562                     comb += is_hit.eq(1)
 563             with m.If(s_tag == reload_tag):
 564                 comb += rel_match.eq(1)
 565
 566         return m
 567
 568
 569 class DCache(Elaboratable):
 570     """Set associative dcache write-through
 571     TODO (in no specific order):
 572     * See list in icache.vhdl
 573     * Complete load misses on the cycle when WB data comes instead of
 574       at the end of line (this requires dealing with requests coming in
 575       while not idle...)
 576     """
 577     def __init__(self):
 578         self.d_in      = LoadStore1ToDCacheType("d_in")
 579         self.d_out     = DCacheToLoadStore1Type("d_out")
 580
 581         self.m_in      = MMUToDCacheType("m_in")
 582         self.m_out     = DCacheToMMUType("m_out")
 583
 584         self.stall_out = Signal()
 585
 586         self.wb_out    = WBMasterOut()
 587         self.wb_in     = WBSlaveOut()
 588
 589         self.log_out   = Signal(20)
 590
 591     def stage_0(self, m, r0, r1, r0_full):
 592         """Latch the request in r0.req as long as we're not stalling
 593         """
 594         comb = m.d.comb
 595         sync = m.d.sync
 596         d_in, d_out, m_in = self.d_in, self.d_out, self.m_in
 597
 598         r = RegStage0("stage0")
 599
 600         # TODO, this goes in unit tests and formal proofs
 601         with m.If(d_in.valid & m_in.valid):
 602             sync += Display("request collision loadstore vs MMU")
 603
 604         with m.If(m_in.valid):
 605             sync += r.req.valid.eq(1)
 606             sync += r.req.load.eq(~(m_in.tlbie | m_in.tlbld))
 607             sync += r.req.dcbz.eq(0)
 608             sync += r.req.nc.eq(0)
 609             sync += r.req.reserve.eq(0)
 610             sync += r.req.virt_mode.eq(1)
 611             sync += r.req.priv_mode.eq(1)
 612             sync += r.req.addr.eq(m_in.addr)
 613             sync += r.req.data.eq(m_in.pte)
 614             sync += r.req.byte_sel.eq(~0) # Const -1 sets all to 0b111....
 615             sync += r.tlbie.eq(m_in.tlbie)
 616             sync += r.doall.eq(m_in.doall)
 617             sync += r.tlbld.eq(m_in.tlbld)
 618             sync += r.mmu_req.eq(1)
 619         with m.Else():
 620             sync += r.req.eq(d_in)
 621             sync += r.tlbie.eq(0)
 622             sync += r.doall.eq(0)
 623             sync += r.tlbld.eq(0)
 624             sync += r.mmu_req.eq(0)
 625             with m.If(~(r1.full & r0_full)):
 626                 sync += r0.eq(r)
 627                 sync += r0_full.eq(r.req.valid)
 628
 629     def tlb_read(self, m, r0_stall, tlb_valid_way,
 630                  tlb_tag_way, tlb_pte_way, dtlb_valid_bits,
 631                  dtlb_tags, dtlb_ptes):
 632         """TLB
 633         Operates in the second cycle on the request latched in r0.req.
 634         TLB updates write the entry at the end of the second cycle.
 635         """
 636         comb = m.d.comb
 637         sync = m.d.sync
 638         m_in, d_in = self.m_in, self.d_in
 639
 640         index    = Signal(TLB_SET_BITS)
 641         addrbits = Signal(TLB_SET_BITS)
 642
 643         amin = TLB_LG_PGSZ
 644         amax = TLB_LG_PGSZ + TLB_SET_BITS
 645
 646         with m.If(m_in.valid):
 647             comb += addrbits.eq(m_in.addr[amin : amax])
 648         with m.Else():
 649             comb += addrbits.eq(d_in.addr[amin : amax])
 650         comb += index.eq(addrbits)
 651
 652         # If we have any op and the previous op isn't finished,
 653         # then keep the same output for next cycle.
 654         with m.If(~r0_stall):
 655             sync += tlb_valid_way.eq(dtlb_valid_bits[index])
 656             sync += tlb_tag_way.eq(dtlb_tags[index])
 657             sync += tlb_pte_way.eq(dtlb_ptes[index])
 658
 659     def maybe_tlb_plrus(self, m, r1, tlb_plru_victim):
 660         """Generate TLB PLRUs
 661         """
 662         comb = m.d.comb
 663         sync = m.d.sync
 664
 665         if TLB_NUM_WAYS == 0:
 666             return
 667         for i in range(TLB_SET_SIZE):
 668             # TLB PLRU interface
 669             tlb_plru        = PLRU(WAY_BITS)
 670             setattr(m.submodules, "maybe_plru_%d" % i, tlb_plru)
 671             tlb_plru_acc_en = Signal()
 672
 673             comb += tlb_plru_acc_en.eq(r1.tlb_hit & (r1.tlb_hit_index == i))
 674             comb += tlb_plru.acc_en.eq(tlb_plru_acc_en)
 675             comb += tlb_plru.acc.eq(r1.tlb_hit_way)
 676             comb += tlb_plru_victim[i].eq(tlb_plru.lru_o)
 677
 678     def tlb_search(self, m, tlb_req_index, r0, r0_valid,
 679                    tlb_valid_way, tlb_tag_way, tlb_hit_way,
 680                    tlb_pte_way, pte, tlb_hit, valid_ra, perm_attr, ra):
 681
 682         comb = m.d.comb
 683         sync = m.d.sync
 684
 685         hitway = Signal(TLB_WAY_BITS)
 686         hit    = Signal()
 687         eatag  = Signal(TLB_EA_TAG_BITS)
 688
 689         TLB_LG_END = TLB_LG_PGSZ + TLB_SET_BITS
 690         comb += tlb_req_index.eq(r0.req.addr[TLB_LG_PGSZ : TLB_LG_END])
 691         comb += eatag.eq(r0.req.addr[TLB_LG_END : 64 ])
 692
 693         for i in range(TLB_NUM_WAYS):
 694             is_tag_hit = Signal()
 695             comb += is_tag_hit.eq(tlb_valid_way[i]
 696                                   & read_tlb_tag(i, tlb_tag_way) == eatag)
 697             with m.If(is_tag_hit):
 698                 comb += hitway.eq(i)
 699                 comb += hit.eq(1)
 700
 701         comb += tlb_hit.eq(hit & r0_valid)
 702         comb += tlb_hit_way.eq(hitway)
 703
 704         with m.If(tlb_hit):
 705             comb += pte.eq(read_tlb_pte(hitway, tlb_pte_way))
 706         with m.Else():
 707             comb += pte.eq(0)
 708         comb += valid_ra.eq(tlb_hit | ~r0.req.virt_mode)
 709         with m.If(r0.req.virt_mode):
 710             comb += ra.eq(Cat(Const(0, ROW_OFF_BITS),
 711                               r0.req.addr[ROW_OFF_BITS:TLB_LG_PGSZ],
 712                               pte[TLB_LG_PGSZ:REAL_ADDR_BITS]))
 713             comb += perm_attr.eq(extract_perm_attr(pte))
 714         with m.Else():
 715             comb += ra.eq(Cat(Const(0, ROW_OFF_BITS),
 716                               r0.req.addr[ROW_OFF_BITS:REAL_ADDR_BITS]))
 717
 718             comb += perm_attr.reference.eq(1)
 719             comb += perm_attr.changed.eq(1)
 720             comb += perm_attr.nocache.eq(0)
 721             comb += perm_attr.priv.eq(1)
 722             comb += perm_attr.rd_perm.eq(1)
 723             comb += perm_attr.wr_perm.eq(1)
 724
 725     def tlb_update(self, m, r0_valid, r0, dtlb_valid_bits, tlb_req_index,
 726                     tlb_hit_way, tlb_hit, tlb_plru_victim, tlb_tag_way,
 727                     dtlb_tags, tlb_pte_way, dtlb_ptes):
 728
 729         comb = m.d.comb
 730         sync = m.d.sync
 731
 732         tlbie    = Signal()
 733         tlbwe    = Signal()
 734
 735         comb += tlbie.eq(r0_valid & r0.tlbie)
 736         comb += tlbwe.eq(r0_valid & r0.tlbld)
 737
 738         m.submodules.tlb_update = d = DTLBUpdate()
 739         with m.If(tlbie & r0.doall):
 740             # clear all valid bits at once
 741             for i in range(TLB_SET_SIZE):
 742                 sync += dtlb_valid_bits[i].eq(0)
 743         with m.If(d.updated):
 744             sync += dtlb_tags[tlb_req_index].eq(d.tb_out)
 745             sync += dtlb_ptes[tlb_req_index].eq(d.pb_out)
 746         with m.If(d.v_updated):
 747             sync += dtlb_valid_bits[tlb_req_index].eq(d.db_out)
 748
 749         comb += d.dv.eq(dtlb_valid_bits[tlb_req_index])
 750
 751         comb += d.tlbie.eq(tlbie)
 752         comb += d.tlbwe.eq(tlbwe)
 753         comb += d.doall.eq(r0.doall)
 754         comb += d.tlb_hit.eq(tlb_hit)
 755         comb += d.tlb_hit_way.eq(tlb_hit_way)
 756         comb += d.tlb_tag_way.eq(tlb_tag_way)
 757         comb += d.tlb_pte_way.eq(tlb_pte_way)
 758         comb += d.tlb_req_index.eq(tlb_req_index)
 759
 760         with m.If(tlb_hit):
 761             comb += d.repl_way.eq(tlb_hit_way)
 762         with m.Else():
 763             comb += d.repl_way.eq(tlb_plru_victim[tlb_req_index])
 764         comb += d.eatag.eq(r0.req.addr[TLB_LG_PGSZ + TLB_SET_BITS:64])
 765         comb += d.pte_data.eq(r0.req.data)
 766
 767     def maybe_plrus(self, m, r1, plru_victim):
 768         """Generate PLRUs
 769         """
 770         comb = m.d.comb
 771         sync = m.d.sync
 772
 773         if TLB_NUM_WAYS == 0:
 774             return
 775
 776         for i in range(NUM_LINES):
 777             # PLRU interface
 778             plru        = PLRU(WAY_BITS)
 779             setattr(m.submodules, "plru%d" % i, plru)
 780             plru_acc_en = Signal()
 781
 782             comb += plru_acc_en.eq(r1.cache_hit & (r1.hit_index == i))
 783             comb += plru.acc_en.eq(plru_acc_en)
 784             comb += plru.acc.eq(r1.hit_way)
 785             comb += plru_victim[i].eq(plru.lru_o)
 786
 787     def cache_tag_read(self, m, r0_stall, req_index, cache_tag_set, cache_tags):
 788         """Cache tag RAM read port
 789         """
 790         comb = m.d.comb
 791         sync = m.d.sync
 792         m_in, d_in = self.m_in, self.d_in
 793
 794         index = Signal(INDEX_BITS)
 795
 796         with m.If(r0_stall):
 797             comb += index.eq(req_index)
 798         with m.Elif(m_in.valid):
 799             comb += index.eq(get_index(m_in.addr))
 800         with m.Else():
 801             comb += index.eq(get_index(d_in.addr))
 802         sync += cache_tag_set.eq(cache_tags[index])
 803
 804     def dcache_request(self, m, r0, ra, req_index, req_row, req_tag,
 805                        r0_valid, r1, cache_valid_bits, replace_way,
 806                        use_forward1_next, use_forward2_next,
 807                        req_hit_way, plru_victim, rc_ok, perm_attr,
 808                        valid_ra, perm_ok, access_ok, req_op, req_go,
 809                        tlb_pte_way,
 810                        tlb_hit, tlb_hit_way, tlb_valid_way, cache_tag_set,
 811                        cancel_store, req_same_tag, r0_stall, early_req_row):
 812         """Cache request parsing and hit detection
 813         """
 814
 815         comb = m.d.comb
 816         sync = m.d.sync
 817         m_in, d_in = self.m_in, self.d_in
 818
 819         is_hit      = Signal()
 820         hit_way     = Signal(WAY_BITS)
 821         op          = Signal(Op)
 822         opsel       = Signal(3)
 823         go          = Signal()
 824         nc          = Signal()
 825         hit_set     = Array(Signal(name="hit_set_%d" % i) \
 826                                   for i in range(TLB_NUM_WAYS))
 827         cache_valid_idx = Signal(INDEX_BITS)
 828
 829         # Extract line, row and tag from request
 830         comb += req_index.eq(get_index(r0.req.addr))
 831         comb += req_row.eq(get_row(r0.req.addr))
 832         comb += req_tag.eq(get_tag(ra))
 833
 834         comb += go.eq(r0_valid & ~(r0.tlbie | r0.tlbld) & ~r1.ls_error)
 835         comb += cache_valid_idx.eq(cache_valid_bits[req_index])
 836
 837         m.submodules.dcache_pend = dc = DCachePendingHit(tlb_pte_way,
 838                                 tlb_valid_way, tlb_hit_way,
 839                                 cache_valid_idx, cache_tag_set,
 840                                 r0.req.addr,
 841                                 hit_set)
 842
 843         comb += dc.tlb_hit.eq(tlb_hit)
 844         comb += dc.reload_tag.eq(r1.reload_tag)
 845         comb += dc.virt_mode.eq(r0.req.virt_mode)
 846         comb += dc.go.eq(go)
 847         comb += dc.req_index.eq(req_index)
 848         comb += is_hit.eq(dc.is_hit)
 849         comb += hit_way.eq(dc.hit_way)
 850         comb += req_same_tag.eq(dc.rel_match)
 851
 852         # See if the request matches the line currently being reloaded
 853         with m.If((r1.state == State.RELOAD_WAIT_ACK) &
 854                   (req_index == r1.store_index) & req_same_tag):
 855             # For a store, consider this a hit even if the row isn't
 856             # valid since it will be by the time we perform the store.
 857             # For a load, check the appropriate row valid bit.
 858             valid = r1.rows_valid[req_row % ROW_PER_LINE]
 859             comb += is_hit.eq(~r0.req.load | valid)
 860             comb += hit_way.eq(replace_way)
 861
 862         # Whether to use forwarded data for a load or not
 863         with m.If((get_row(r1.req.real_addr) == req_row) &
 864                   (r1.req.hit_way == hit_way)):
 865             # Only need to consider r1.write_bram here, since if we
 866             # are writing refill data here, then we don't have a
 867             # cache hit this cycle on the line being refilled.
 868             # (There is the possibility that the load following the
 869             # load miss that started the refill could be to the old
 870             # contents of the victim line, since it is a couple of
 871             # cycles after the refill starts before we see the updated
 872             # cache tag. In that case we don't use the bypass.)
 873             comb += use_forward1_next.eq(r1.write_bram)
 874         with m.If((r1.forward_row1 == req_row) & (r1.forward_way1 == hit_way)):
 875             comb += use_forward2_next.eq(r1.forward_valid1)
 876
 877         # The way that matched on a hit
 878         comb += req_hit_way.eq(hit_way)
 879
 880         # The way to replace on a miss
 881         with m.If(r1.write_tag):
 882             comb += replace_way.eq(plru_victim[r1.store_index])
 883         with m.Else():
 884             comb += replace_way.eq(r1.store_way)
 885
 886         # work out whether we have permission for this access
 887         # NB we don't yet implement AMR, thus no KUAP
 888         comb += rc_ok.eq(perm_attr.reference
 889                          & (r0.req.load | perm_attr.changed)
 890                 )
 891         comb += perm_ok.eq((r0.req.priv_mode | ~perm_attr.priv) &
 892                            (perm_attr.wr_perm |
 893                               (r0.req.load & perm_attr.rd_perm)))
 894         comb += access_ok.eq(valid_ra & perm_ok & rc_ok)
 895         # Combine the request and cache hit status to decide what
 896         # operation needs to be done
 897         comb += nc.eq(r0.req.nc | perm_attr.nocache)
 898         comb += op.eq(Op.OP_NONE)
 899         with m.If(go):
 900             with m.If(~access_ok):
 901                 comb += op.eq(Op.OP_BAD)
 902             with m.Elif(cancel_store):
 903                 comb += op.eq(Op.OP_STCX_FAIL)
 904             with m.Else():
 905                 comb += opsel.eq(Cat(is_hit, nc, r0.req.load))
 906                 with m.Switch(opsel):
 907                     with m.Case(0b101):
 908                         comb += op.eq(Op.OP_LOAD_HIT)
 909                     with m.Case(0b100):
 910                         comb += op.eq(Op.OP_LOAD_MISS)
 911                     with m.Case(0b110):
 912                         comb += op.eq(Op.OP_LOAD_NC)
 913                     with m.Case(0b001):
 914                         comb += op.eq(Op.OP_STORE_HIT)
 915                     with m.Case(0b000):
 916                         comb += op.eq(Op.OP_STORE_MISS)
 917                     with m.Case(0b010):
 918                         comb += op.eq(Op.OP_STORE_MISS)
 919                     with m.Case(0b011):
 920                         comb += op.eq(Op.OP_BAD)
 921                     with m.Case(0b111):
 922                         comb += op.eq(Op.OP_BAD)
 923                     with m.Default():
 924                         comb += op.eq(Op.OP_NONE)
 925         comb += req_op.eq(op)
 926         comb += req_go.eq(go)
 927
 928         # Version of the row number that is valid one cycle earlier
 929         # in the cases where we need to read the cache data BRAM.
 930         # If we're stalling then we need to keep reading the last
 931         # row requested.
 932         with m.If(~r0_stall):
 933             with m.If(m_in.valid):
 934                 comb += early_req_row.eq(get_row(m_in.addr))
 935             with m.Else():
 936                 comb += early_req_row.eq(get_row(d_in.addr))
 937         with m.Else():
 938             comb += early_req_row.eq(req_row)
 939
 940     def reservation_comb(self, m, cancel_store, set_rsrv, clear_rsrv,
 941                          r0_valid, r0, reservation):
 942         """Handle load-with-reservation and store-conditional instructions
 943         """
 944         comb = m.d.comb
 945         sync = m.d.sync
 946
 947         with m.If(r0_valid & r0.req.reserve):
 948
 949             # XXX generate alignment interrupt if address
 950             # is not aligned XXX or if r0.req.nc = '1'
 951             with m.If(r0.req.load):
 952                 comb += set_rsrv.eq(1) # load with reservation
 953             with m.Else():
 954                 comb += clear_rsrv.eq(1) # store conditional
 955                 with m.If(~reservation.valid | r0.req.addr[LINE_OFF_BITS:64]):
 956                     comb += cancel_store.eq(1)
 957
 958     def reservation_reg(self, m, r0_valid, access_ok, set_rsrv, clear_rsrv,
 959                         reservation, r0):
 960
 961         comb = m.d.comb
 962         sync = m.d.sync
 963
 964         with m.If(r0_valid & access_ok):
 965             with m.If(clear_rsrv):
 966                 sync += reservation.valid.eq(0)
 967             with m.Elif(set_rsrv):
 968                 sync += reservation.valid.eq(1)
 969                 sync += reservation.addr.eq(r0.req.addr[LINE_OFF_BITS:64])
 970
 971     def writeback_control(self, m, r1, cache_out):
 972         """Return data for loads & completion control logic
 973         """
 974         comb = m.d.comb
 975         sync = m.d.sync
 976         d_out, m_out = self.d_out, self.m_out
 977
 978         data_out = Signal(64)
 979         data_fwd = Signal(64)
 980
 981         # Use the bypass if are reading the row that was
 982         # written 1 or 2 cycles ago, including for the
 983         # slow_valid = 1 case (i.e. completing a load
 984         # miss or a non-cacheable load).
 985         with m.If(r1.use_forward1):
 986             comb += data_fwd.eq(r1.forward_data1)
 987         with m.Else():
 988             comb += data_fwd.eq(r1.forward_data2)
 989
 990         comb += data_out.eq(cache_out[r1.hit_way])
 991
 992         for i in range(8):
 993             with m.If(r1.forward_sel[i]):
 994                 dsel = data_fwd.word_select(i, 8)
 995                 comb += data_out.word_select(i, 8).eq(dsel)
 996
 997         comb += d_out.valid.eq(r1.ls_valid)
 998         comb += d_out.data.eq(data_out)
 999         comb += d_out.store_done.eq(~r1.stcx_fail)
1000         comb += d_out.error.eq(r1.ls_error)
1001         comb += d_out.cache_paradox.eq(r1.cache_paradox)
1002
1003         # Outputs to MMU
1004         comb += m_out.done.eq(r1.mmu_done)
1005         comb += m_out.err.eq(r1.mmu_error)
1006         comb += m_out.data.eq(data_out)
1007
1008         # We have a valid load or store hit or we just completed
1009         # a slow op such as a load miss, a NC load or a store
1010         #
1011         # Note: the load hit is delayed by one cycle. However it
1012         # can still not collide with r.slow_valid (well unless I
1013         # miscalculated) because slow_valid can only be set on a
1014         # subsequent request and not on its first cycle (the state
1015         # machine must have advanced), which makes slow_valid
1016         # at least 2 cycles from the previous hit_load_valid.
1017
1018         # Sanity: Only one of these must be set in any given cycle
1019
1020         if False: # TODO: need Display to get this to work
1021             assert (r1.slow_valid & r1.stcx_fail) != 1, \
1022             "unexpected slow_valid collision with stcx_fail"
1023
1024             assert ((r1.slow_valid | r1.stcx_fail) | r1.hit_load_valid) != 1, \
1025              "unexpected hit_load_delayed collision with slow_valid"
1026
1027         with m.If(~r1.mmu_req):
1028             # Request came from loadstore1...
1029             # Load hit case is the standard path
1030             with m.If(r1.hit_load_valid):
1031                 sync += Display("completing load hit data=%x", data_out)
1032
1033             # error cases complete without stalling
1034             with m.If(r1.ls_error):
1035                 sync += Display("completing ld/st with error")
1036
1037             # Slow ops (load miss, NC, stores)
1038             with m.If(r1.slow_valid):
1039                 sync += Display("completing store or load miss data=%x",
1040                                 data_out)
1041
1042         with m.Else():
1043             # Request came from MMU
1044             with m.If(r1.hit_load_valid):
1045                 sync += Display("completing load hit to MMU, data=%x",
1046                                 m_out.data)
1047             # error cases complete without stalling
1048             with m.If(r1.mmu_error):
1049                 sync += Display("combpleting MMU ld with error")
1050
1051             # Slow ops (i.e. load miss)
1052             with m.If(r1.slow_valid):
1053                 sync += Display("completing MMU load miss, data=%x",
1054                                 m_out.data)
1055
1056     def rams(self, m, r1, early_req_row, cache_out, replace_way):
1057         """rams
1058         Generate a cache RAM for each way. This handles the normal
1059         reads, writes from reloads and the special store-hit update
1060         path as well.
1061
1062         Note: the BRAMs have an extra read buffer, meaning the output
1063         is pipelined an extra cycle. This differs from the
1064         icache. The writeback logic needs to take that into
1065         account by using 1-cycle delayed signals for load hits.
1066         """
1067         comb = m.d.comb
1068         wb_in = self.wb_in
1069
1070         for i in range(NUM_WAYS):
1071             do_read  = Signal(name="do_rd%d" % i)
1072             rd_addr  = Signal(ROW_BITS)
1073             do_write = Signal(name="do_wr%d" % i)
1074             wr_addr  = Signal(ROW_BITS)
1075             wr_data  = Signal(WB_DATA_BITS)
1076             wr_sel   = Signal(ROW_SIZE)
1077             wr_sel_m = Signal(ROW_SIZE)
1078             _d_out   = Signal(WB_DATA_BITS, name="dout_%d" % i)
1079
1080             way = CacheRam(ROW_BITS, WB_DATA_BITS, True)
1081             setattr(m.submodules, "cacheram_%d" % i, way)
1082
1083             comb += way.rd_en.eq(do_read)
1084             comb += way.rd_addr.eq(rd_addr)
1085             comb += _d_out.eq(way.rd_data_o)
1086             comb += way.wr_sel.eq(wr_sel_m)
1087             comb += way.wr_addr.eq(wr_addr)
1088             comb += way.wr_data.eq(wr_data)
1089
1090             # Cache hit reads
1091             comb += do_read.eq(1)
1092             comb += rd_addr.eq(early_req_row[:ROW_BITS])
1093             comb += cache_out[i].eq(_d_out)
1094
1095             # Write mux:
1096             #
1097             # Defaults to wishbone read responses (cache refill)
1098             #
1099             # For timing, the mux on wr_data/sel/addr is not
1100             # dependent on anything other than the current state.
1101
1102             with m.If(r1.write_bram):
1103                 # Write store data to BRAM.  This happens one
1104                 # cycle after the store is in r0.
1105                 comb += wr_data.eq(r1.req.data)
1106                 comb += wr_sel.eq(r1.req.byte_sel)
1107                 comb += wr_addr.eq(get_row(r1.req.real_addr))
1108
1109                 with m.If(i == r1.req.hit_way):
1110                     comb += do_write.eq(1)
1111             with m.Else():
1112                 # Otherwise, we might be doing a reload or a DCBZ
1113                 with m.If(r1.dcbz):
1114                     comb += wr_data.eq(0)
1115                 with m.Else():
1116                     comb += wr_data.eq(wb_in.dat)
1117                 comb += wr_addr.eq(r1.store_row)
1118                 comb += wr_sel.eq(~0) # all 1s
1119
1120             with m.If((r1.state == State.RELOAD_WAIT_ACK)
1121                       & wb_in.ack & (replace_way == i)):
1122                 comb += do_write.eq(1)
1123
1124             # Mask write selects with do_write since BRAM
1125             # doesn't have a global write-enable
1126             with m.If(do_write):
1127                 comb += wr_sel_m.eq(wr_sel)
1128
1129     # Cache hit synchronous machine for the easy case.
1130     # This handles load hits.
1131     # It also handles error cases (TLB miss, cache paradox)
1132     def dcache_fast_hit(self, m, req_op, r0_valid, r0, r1,
1133                         req_hit_way, req_index, req_tag, access_ok,
1134                         tlb_hit, tlb_hit_way, tlb_req_index):
1135
1136         comb = m.d.comb
1137         sync = m.d.sync
1138
1139         with m.If(req_op != Op.OP_NONE):
1140             sync += Display("op:%d addr:%x nc: %d idx: %x tag: %x way: %x",
1141                     req_op, r0.req.addr, r0.req.nc,
1142                     req_index, req_tag, req_hit_way)
1143
1144         with m.If(r0_valid):
1145             sync += r1.mmu_req.eq(r0.mmu_req)
1146
1147         # Fast path for load/store hits.
1148         # Set signals for the writeback controls.
1149         sync += r1.hit_way.eq(req_hit_way)
1150         sync += r1.hit_index.eq(req_index)
1151
1152         with m.If(req_op == Op.OP_LOAD_HIT):
1153             sync += r1.hit_load_valid.eq(1)
1154         with m.Else():
1155             sync += r1.hit_load_valid.eq(0)
1156
1157         with m.If((req_op == Op.OP_LOAD_HIT) | (req_op == Op.OP_STORE_HIT)):
1158             sync += r1.cache_hit.eq(1)
1159         with m.Else():
1160             sync += r1.cache_hit.eq(0)
1161
1162         with m.If(req_op == Op.OP_BAD):
1163             # Display(f"Signalling ld/st error valid_ra={valid_ra}"
1164             #      f"rc_ok={rc_ok} perm_ok={perm_ok}"
1165             sync += r1.ls_error.eq(~r0.mmu_req)
1166             sync += r1.mmu_error.eq(r0.mmu_req)
1167             sync += r1.cache_paradox.eq(access_ok)
1168
1169             with m.Else():
1170                 sync += r1.ls_error.eq(0)
1171                 sync += r1.mmu_error.eq(0)
1172                 sync += r1.cache_paradox.eq(0)
1173
1174         with m.If(req_op == Op.OP_STCX_FAIL):
1175             r1.stcx_fail.eq(1)
1176         with m.Else():
1177             sync += r1.stcx_fail.eq(0)
1178
1179         # Record TLB hit information for updating TLB PLRU
1180         sync += r1.tlb_hit.eq(tlb_hit)
1181         sync += r1.tlb_hit_way.eq(tlb_hit_way)
1182         sync += r1.tlb_hit_index.eq(tlb_req_index)
1183
1184     # Memory accesses are handled by this state machine:
1185     #
1186     #   * Cache load miss/reload (in conjunction with "rams")
1187     #   * Load hits for non-cachable forms
1188     #   * Stores (the collision case is handled in "rams")
1189     #
1190     # All wishbone requests generation is done here.
1191     # This machine operates at stage 1.
1192     def dcache_slow(self, m, r1, use_forward1_next, use_forward2_next,
1193                     cache_valid_bits, r0, replace_way,
1194                     req_hit_way, req_same_tag,
1195                     r0_valid, req_op, cache_tags, req_go, ra):
1196
1197         comb = m.d.comb
1198         sync = m.d.sync
1199         wb_in = self.wb_in
1200
1201         req         = MemAccessRequest("mreq_ds")
1202         acks        = Signal(3)
1203         adjust_acks = Signal(3)
1204
1205         req_row = Signal(ROW_BITS)
1206         req_idx = Signal(INDEX_BITS)
1207         req_tag = Signal(TAG_BITS)
1208         comb += req_idx.eq(get_index(req.real_addr))
1209         comb += req_row.eq(get_row(req.real_addr))
1210         comb += req_tag.eq(get_tag(req.real_addr))
1211
1212         sync += r1.use_forward1.eq(use_forward1_next)
1213         sync += r1.forward_sel.eq(0)
1214
1215         with m.If(use_forward1_next):
1216             sync += r1.forward_sel.eq(r1.req.byte_sel)
1217         with m.Elif(use_forward2_next):
1218             sync += r1.forward_sel.eq(r1.forward_sel1)
1219
1220         sync += r1.forward_data2.eq(r1.forward_data1)
1221         with m.If(r1.write_bram):
1222             sync += r1.forward_data1.eq(r1.req.data)
1223             sync += r1.forward_sel1.eq(r1.req.byte_sel)
1224             sync += r1.forward_way1.eq(r1.req.hit_way)
1225             sync += r1.forward_row1.eq(get_row(r1.req.real_addr))
1226             sync += r1.forward_valid1.eq(1)
1227         with m.Else():
1228             with m.If(r1.dcbz):
1229                 sync += r1.forward_data1.eq(0)
1230             with m.Else():
1231                 sync += r1.forward_data1.eq(wb_in.dat)
1232             sync += r1.forward_sel1.eq(~0) # all 1s
1233             sync += r1.forward_way1.eq(replace_way)
1234             sync += r1.forward_row1.eq(r1.store_row)
1235             sync += r1.forward_valid1.eq(0)
1236
1237         # One cycle pulses reset
1238         sync += r1.slow_valid.eq(0)
1239         sync += r1.write_bram.eq(0)
1240         sync += r1.inc_acks.eq(0)
1241         sync += r1.dec_acks.eq(0)
1242
1243         sync += r1.ls_valid.eq(0)
1244         # complete tlbies and TLB loads in the third cycle
1245         sync += r1.mmu_done.eq(r0_valid & (r0.tlbie | r0.tlbld))
1246
1247         with m.If((req_op == Op.OP_LOAD_HIT)
1248                   | (req_op == Op.OP_STCX_FAIL)):
1249             with m.If(~r0.mmu_req):
1250                 sync += r1.ls_valid.eq(1)
1251             with m.Else():
1252                 sync += r1.mmu_done.eq(1)
1253
1254         with m.If(r1.write_tag):
1255             # Store new tag in selected way
1256             for i in range(NUM_WAYS):
1257                 with m.If(i == replace_way):
1258                     ct = Signal(TAG_RAM_WIDTH)
1259                     comb += ct.eq(cache_tags[r1.store_index])
1260                     comb += ct.word_select(i, TAG_WIDTH).eq(r1.reload_tag)
1261                     sync += cache_tags[r1.store_index].eq(ct)
1262             sync += r1.store_way.eq(replace_way)
1263             sync += r1.write_tag.eq(0)
1264
1265         # Take request from r1.req if there is one there,
1266         # else from req_op, ra, etc.
1267         with m.If(r1.full):
1268             comb += req.eq(r1.req)
1269         with m.Else():
1270             comb += req.op.eq(req_op)
1271             comb += req.valid.eq(req_go)
1272             comb += req.mmu_req.eq(r0.mmu_req)
1273             comb += req.dcbz.eq(r0.req.dcbz)
1274             comb += req.real_addr.eq(ra)
1275
1276             with m.If(~r0.req.dcbz):
1277                 comb += req.data.eq(r0.req.data)
1278             with m.Else():
1279                 comb += req.data.eq(0)
1280
1281             # Select all bytes for dcbz
1282             # and for cacheable loads
1283             with m.If(r0.req.dcbz | (r0.req.load & ~r0.req.nc)):
1284                 comb += req.byte_sel.eq(~0) # all 1s
1285             with m.Else():
1286                 comb += req.byte_sel.eq(r0.req.byte_sel)
1287             comb += req.hit_way.eq(req_hit_way)
1288             comb += req.same_tag.eq(req_same_tag)
1289
1290             # Store the incoming request from r0,
1291             # if it is a slow request
1292             # Note that r1.full = 1 implies req_op = OP_NONE
1293             with m.If((req_op == Op.OP_LOAD_MISS)
1294                       | (req_op == Op.OP_LOAD_NC)
1295                       | (req_op == Op.OP_STORE_MISS)
1296                       | (req_op == Op.OP_STORE_HIT)):
1297                 sync += r1.req.eq(req)
1298                 sync += r1.full.eq(1)
1299
1300         # Main state machine
1301         with m.Switch(r1.state):
1302
1303             with m.Case(State.IDLE):
1304                 sync += r1.wb.adr.eq(req.real_addr)
1305                 sync += r1.wb.sel.eq(req.byte_sel)
1306                 sync += r1.wb.dat.eq(req.data)
1307                 sync += r1.dcbz.eq(req.dcbz)
1308
1309                 # Keep track of our index and way
1310                 # for subsequent stores.
1311                 sync += r1.store_index.eq(req_idx)
1312                 sync += r1.store_row.eq(req_row)
1313                 sync += r1.end_row_ix.eq(get_row_of_line(req_row))
1314                 sync += r1.reload_tag.eq(req_tag)
1315                 sync += r1.req.same_tag.eq(1)
1316
1317                 with m.If(req.op == Op.OP_STORE_HIT):
1318                     sync += r1.store_way.eq(req.hit_way)
1319
1320                 # Reset per-row valid bits,
1321                 # ready for handling OP_LOAD_MISS
1322                 for i in range(ROW_PER_LINE):
1323                     sync += r1.rows_valid[i].eq(0)
1324
1325                 with m.If(req_op != Op.OP_NONE):
1326                     sync += Display("cache op %d", req.op)
1327
1328                 with m.Switch(req.op):
1329                     with m.Case(Op.OP_LOAD_HIT):
1330                         # stay in IDLE state
1331                         pass
1332
1333                     with m.Case(Op.OP_LOAD_MISS):
1334                         sync += Display("cache miss real addr: %x " \
1335                                 "idx: %x tag: %x",
1336                                 req.real_addr, req_row, req_tag)
1337
1338                         # Start the wishbone cycle
1339                         sync += r1.wb.we.eq(0)
1340                         sync += r1.wb.cyc.eq(1)
1341                         sync += r1.wb.stb.eq(1)
1342
1343                         # Track that we had one request sent
1344                         sync += r1.state.eq(State.RELOAD_WAIT_ACK)
1345                         sync += r1.write_tag.eq(1)
1346
1347                     with m.Case(Op.OP_LOAD_NC):
1348                         sync += r1.wb.cyc.eq(1)
1349                         sync += r1.wb.stb.eq(1)
1350                         sync += r1.wb.we.eq(0)
1351                         sync += r1.state.eq(State.NC_LOAD_WAIT_ACK)
1352
1353                     with m.Case(Op.OP_STORE_HIT, Op.OP_STORE_MISS):
1354                         with m.If(~req.dcbz):
1355                             sync += r1.state.eq(State.STORE_WAIT_ACK)
1356                             sync += r1.acks_pending.eq(1)
1357                             sync += r1.full.eq(0)
1358                             sync += r1.slow_valid.eq(1)
1359
1360                             with m.If(~req.mmu_req):
1361                                 sync += r1.ls_valid.eq(1)
1362                             with m.Else():
1363                                 sync += r1.mmu_done.eq(1)
1364
1365                             with m.If(req.op == Op.OP_STORE_HIT):
1366                                 sync += r1.write_bram.eq(1)
1367                         with m.Else():
1368                             # dcbz is handled much like a load miss except
1369                             # that we are writing to memory instead of reading
1370                             sync += r1.state.eq(State.RELOAD_WAIT_ACK)
1371
1372                             with m.If(req.op == Op.OP_STORE_MISS):
1373                                 sync += r1.write_tag.eq(1)
1374
1375                         sync += r1.wb.we.eq(1)
1376                         sync += r1.wb.cyc.eq(1)
1377                         sync += r1.wb.stb.eq(1)
1378
1379                     # OP_NONE and OP_BAD do nothing
1380                     # OP_BAD & OP_STCX_FAIL were
1381                     # handled above already
1382                     with m.Case(Op.OP_NONE):
1383                         pass
1384                     with m.Case(Op.OP_BAD):
1385                         pass
1386                     with m.Case(Op.OP_STCX_FAIL):
1387                         pass
1388
1389             with m.Case(State.RELOAD_WAIT_ACK):
1390                 ld_stbs_done = Signal()
1391                 # Requests are all sent if stb is 0
1392                 comb += ld_stbs_done.eq(~r1.wb.stb)
1393
1394                 with m.If((~wb_in.stall) & r1.wb.stb):
1395                     # That was the last word?
1396                     # We are done sending.
1397                     # Clear stb and set ld_stbs_done
1398                     # so we can handle an eventual
1399                     # last ack on the same cycle.
1400                     with m.If(is_last_row_addr(r1.wb.adr, r1.end_row_ix)):
1401                         sync += r1.wb.stb.eq(0)
1402                         comb += ld_stbs_done.eq(1)
1403
1404                     # Calculate the next row address in the current cache line
1405                     rarange = Signal(LINE_OFF_BITS-ROW_OFF_BITS)
1406                     comb += rarange.eq(r1.wb.adr[ROW_OFF_BITS:LINE_OFF_BITS]+1)
1407                     sync += r1.wb.adr[ROW_OFF_BITS:LINE_OFF_BITS].eq(rarange)
1408
1409                 # Incoming acks processing
1410                 sync += r1.forward_valid1.eq(wb_in.ack)
1411                 with m.If(wb_in.ack):
1412                     sync += r1.rows_valid[r1.store_row % ROW_PER_LINE].eq(1)
1413
1414                     # If this is the data we were looking for,
1415                     # we can complete the request next cycle.
1416                     # Compare the whole address in case the
1417                     # request in r1.req is not the one that
1418                     # started this refill.
1419                     with m.If(r1.full & r1.req.same_tag &
1420                               ((r1.dcbz & r1.req.dcbz) |
1421                                (~r1.dcbz & (r1.req.op == Op.OP_LOAD_MISS))) &
1422                                 (r1.store_row == get_row(r1.req.real_addr))):
1423                         sync += r1.full.eq(0)
1424                         sync += r1.slow_valid.eq(1)
1425                         with m.If(~r1.mmu_req):
1426                             sync += r1.ls_valid.eq(1)
1427                         with m.Else():
1428                             sync += r1.mmu_done.eq(1)
1429                         sync += r1.forward_sel.eq(~0) # all 1s
1430                         sync += r1.use_forward1.eq(1)
1431
1432                     # Check for completion
1433                     with m.If(ld_stbs_done & is_last_row(r1.store_row,
1434                                                       r1.end_row_ix)):
1435                         # Complete wishbone cycle
1436                         sync += r1.wb.cyc.eq(0)
1437
1438                         # Cache line is now valid
1439                         cv = Signal(INDEX_BITS)
1440                         comb += cv.eq(cache_valid_bits[r1.store_index])
1441                         comb += cv.bit_select(r1.store_way, 1).eq(1)
1442                         sync += cache_valid_bits[r1.store_index].eq(cv)
1443                         sync += r1.state.eq(State.IDLE)
1444
1445                     # Increment store row counter
1446                     sync += r1.store_row.eq(next_row(r1.store_row))
1447
1448             with m.Case(State.STORE_WAIT_ACK):
1449                 st_stbs_done = Signal()
1450                 comb += st_stbs_done.eq(~r1.wb.stb)
1451                 comb += acks.eq(r1.acks_pending)
1452
1453                 with m.If(r1.inc_acks != r1.dec_acks):
1454                     with m.If(r1.inc_acks):
1455                         comb += adjust_acks.eq(acks + 1)
1456                     with m.Else():
1457                         comb += adjust_acks.eq(acks - 1)
1458                 with m.Else():
1459                     comb += adjust_acks.eq(acks)
1460
1461                 sync += r1.acks_pending.eq(adjust_acks)
1462
1463                 # Clear stb when slave accepted request
1464                 with m.If(~wb_in.stall):
1465                     # See if there is another store waiting
1466                     # to be done which is in the same real page.
1467                     with m.If(req.valid):
1468                         ra = req.real_addr[0:SET_SIZE_BITS]
1469                         sync += r1.wb.adr[0:SET_SIZE_BITS].eq(ra)
1470                         sync += r1.wb.dat.eq(req.data)
1471                         sync += r1.wb.sel.eq(req.byte_sel)
1472
1473                     with m.Elif((adjust_acks < 7) & req.same_tag &
1474                                 ((req.op == Op.OP_STORE_MISS)
1475                                  | (req.op == Op.OP_STORE_HIT))):
1476                         sync += r1.wb.stb.eq(1)
1477                         comb += st_stbs_done.eq(0)
1478
1479                         with m.If(req.op == Op.OP_STORE_HIT):
1480                             sync += r1.write_bram.eq(1)
1481                         sync += r1.full.eq(0)
1482                         sync += r1.slow_valid.eq(1)
1483
1484                         # Store requests never come from the MMU
1485                         sync += r1.ls_valid.eq(1)
1486                         comb += st_stbs_done.eq(0)
1487                         sync += r1.inc_acks.eq(1)
1488                     with m.Else():
1489                         sync += r1.wb.stb.eq(0)
1490                         comb += st_stbs_done.eq(1)
1491
1492                 # Got ack ? See if complete.
1493                 with m.If(wb_in.ack):
1494                     with m.If(st_stbs_done & (adjust_acks == 1)):
1495                         sync += r1.state.eq(State.IDLE)
1496                         sync += r1.wb.cyc.eq(0)
1497                         sync += r1.wb.stb.eq(0)
1498                     sync += r1.dec_acks.eq(1)
1499
1500             with m.Case(State.NC_LOAD_WAIT_ACK):
1501                 # Clear stb when slave accepted request
1502                 with m.If(~wb_in.stall):
1503                     sync += r1.wb.stb.eq(0)
1504
1505                 # Got ack ? complete.
1506                 with m.If(wb_in.ack):
1507                     sync += r1.state.eq(State.IDLE)
1508                     sync += r1.full.eq(0)
1509                     sync += r1.slow_valid.eq(1)
1510
1511                     with m.If(~r1.mmu_req):
1512                         sync += r1.ls_valid.eq(1)
1513                     with m.Else():
1514                         sync += r1.mmu_done.eq(1)
1515
1516                     sync += r1.forward_sel.eq(~0) # all 1s
1517                     sync += r1.use_forward1.eq(1)
1518                     sync += r1.wb.cyc.eq(0)
1519                     sync += r1.wb.stb.eq(0)
1520
1521     def dcache_log(self, m, r1, valid_ra, tlb_hit_way, stall_out):
1522
1523         sync = m.d.sync
1524         d_out, wb_in, log_out = self.d_out, self.wb_in, self.log_out
1525
1526         sync += log_out.eq(Cat(r1.state[:3], valid_ra, tlb_hit_way[:3],
1527                                stall_out, req_op[:3], d_out.valid, d_out.error,
1528                                r1.wb.cyc, r1.wb.stb, wb_in.ack, wb_in.stall,
1529                                r1.wb.adr[3:6]))
1530
    def elaborate(self, platform):
        """Build and return the nmigen Module implementing the data cache.

        Allocates the cache tag / valid-bit storage, the TLB storage, the
        two pipeline register stages, the reservation record and the
        shared signals, then passes them to the helper methods
        (stage_0, tlb_read, ... dcache_slow), which add the actual logic
        to the module.

        platform -- not referenced anywhere in this method
        """

        m = Module()
        comb = m.d.comb

        # Storage. Hopefully "cache_rows" is a BRAM, the rest is LUTs
        cache_tags       = CacheTagArray()
        cache_tag_set    = Signal(TAG_RAM_WIDTH)
        cache_valid_bits = CacheValidBitsArray()

        # TODO attribute ram_style : string;
        # TODO attribute ram_style of cache_tags : signal is "distributed";

        """note: these are passed to nmigen.hdl.Memory as "attributes".
           don't know how, just that they are.
        """
        # TLB storage: valid bits, tags and PTEs
        dtlb_valid_bits = TLBValidBitsArray()
        dtlb_tags       = TLBTagsArray()
        dtlb_ptes       = TLBPtesArray()
        # TODO attribute ram_style of
        #  dtlb_tags : signal is "distributed";
        # TODO attribute ram_style of
        #  dtlb_ptes : signal is "distributed";

        # Stage 0 registers plus a flag saying they hold a request
        r0      = RegStage0("r0")
        r0_full = Signal()

        # Stage 1 registers (includes the wishbone request latch, see below)
        r1 = RegStage1("r1")

        reservation = Reservation()

        # Async signals on incoming request
        req_index    = Signal(INDEX_BITS)
        req_row      = Signal(ROW_BITS)
        req_hit_way  = Signal(WAY_BITS)
        req_tag      = Signal(TAG_BITS)
        req_op       = Signal(Op)
        # NOTE(review): req_data is not passed to any helper below
        req_data     = Signal(64)
        req_same_tag = Signal()
        req_go       = Signal()

        early_req_row     = Signal(ROW_BITS)

        cancel_store      = Signal()
        set_rsrv          = Signal()
        clear_rsrv        = Signal()

        r0_valid          = Signal()
        r0_stall          = Signal()

        use_forward1_next = Signal()
        use_forward2_next = Signal()

        cache_out         = CacheRamOut()

        plru_victim       = PLRUOut()
        replace_way       = Signal(WAY_BITS)

        # Wishbone read/write/cache write formatting signals
        # NOTE(review): bus_sel is not passed to any helper below
        bus_sel           = Signal(8)

        # TLB signals
        tlb_tag_way   = Signal(TLB_TAG_WAY_BITS)
        tlb_pte_way   = Signal(TLB_PTE_WAY_BITS)
        tlb_valid_way = Signal(TLB_NUM_WAYS)
        tlb_req_index = Signal(TLB_SET_BITS)
        tlb_hit       = Signal()
        tlb_hit_way   = Signal(TLB_WAY_BITS)
        pte           = Signal(TLB_PTE_BITS)
        ra            = Signal(REAL_ADDR_BITS)
        valid_ra      = Signal()
        perm_attr     = PermAttr("dc_perms")
        rc_ok         = Signal()
        perm_ok       = Signal()
        access_ok     = Signal()

        tlb_plru_victim = TLBPLRUOut()

        # we don't yet handle collisions between loadstore1 requests
        # and MMU requests
        # (so the stall output towards the MMU is tied to 0)
        comb += self.m_out.stall.eq(0)

        # Hold off the request in r0 when r1 has an uncompleted request
        # r0_stall and r0_valid are mutually exclusive: both need r0_full,
        # and differ only in whether r1.full is set
        comb += r0_stall.eq(r0_full & r1.full)
        comb += r0_valid.eq(r0_full & ~r1.full)
        comb += self.stall_out.eq(r0_stall)

        # Wire up wishbone request latch out of stage 1
        comb += self.wb_out.eq(r1.wb)

        # call sub-functions putting everything together, using shared
        # signals established above
        self.stage_0(m, r0, r1, r0_full)
        self.tlb_read(m, r0_stall, tlb_valid_way,
                      tlb_tag_way, tlb_pte_way, dtlb_valid_bits,
                      dtlb_tags, dtlb_ptes)
        self.tlb_search(m, tlb_req_index, r0, r0_valid,
                        tlb_valid_way, tlb_tag_way, tlb_hit_way,
                        tlb_pte_way, pte, tlb_hit, valid_ra, perm_attr, ra)
        self.tlb_update(m, r0_valid, r0, dtlb_valid_bits, tlb_req_index,
                        tlb_hit_way, tlb_hit, tlb_plru_victim, tlb_tag_way,
                        dtlb_tags, tlb_pte_way, dtlb_ptes)
        self.maybe_plrus(m, r1, plru_victim)
        self.maybe_tlb_plrus(m, r1, tlb_plru_victim)
        self.cache_tag_read(m, r0_stall, req_index, cache_tag_set, cache_tags)
        self.dcache_request(m, r0, ra, req_index, req_row, req_tag,
                           r0_valid, r1, cache_valid_bits, replace_way,
                           use_forward1_next, use_forward2_next,
                           req_hit_way, plru_victim, rc_ok, perm_attr,
                           valid_ra, perm_ok, access_ok, req_op, req_go,
                           tlb_pte_way,
                           tlb_hit, tlb_hit_way, tlb_valid_way, cache_tag_set,
                           cancel_store, req_same_tag, r0_stall, early_req_row)
        self.reservation_comb(m, cancel_store, set_rsrv, clear_rsrv,
                           r0_valid, r0, reservation)
        self.reservation_reg(m, r0_valid, access_ok, set_rsrv, clear_rsrv,
                           reservation, r0)
        self.writeback_control(m, r1, cache_out)
        self.rams(m, r1, early_req_row, cache_out, replace_way)
        self.dcache_fast_hit(m, req_op, r0_valid, r0, r1,
                        req_hit_way, req_index, req_tag, access_ok,
                        tlb_hit, tlb_hit_way, tlb_req_index)
        self.dcache_slow(m, r1, use_forward1_next, use_forward2_next,
                    cache_valid_bits, r0, replace_way,
                    req_hit_way, req_same_tag,
                         r0_valid, req_op, cache_tags, req_go, ra)
        # Logging is disabled.
        # NOTE(review): `stall_out` is not defined in this method (the
        # output driven above is self.stall_out), so the call below cannot
        # simply be uncommented as written.
        #self.dcache_log(m, r1, valid_ra, tlb_hit_way, stall_out)

        return m
1660
def dcache_load(dut, addr, nc=0):
    """Simulation helper: issue one load to the dcache and return the data.

    dut  -- object exposing the ``d_in`` request and ``d_out`` response ports
    addr -- address to load from
    nc   -- value driven onto ``d_in.nc`` (default 0)

    The request is presented with ``valid`` high for one clock, then
    ``valid`` is dropped and ``d_out.valid`` is polled once per clock
    until it reads non-zero.  The value of ``d_out.data`` at that point
    is returned.
    """
    request, response = dut.d_in, dut.d_out

    # drive the request fields, valid last
    setup = (
        request.load.eq(1),
        request.nc.eq(nc),
        request.addr.eq(addr),
        request.valid.eq(1),
    )
    for assignment in setup:
        yield assignment
    yield

    # request is only presented for a single clock
    yield request.valid.eq(0)
    yield

    # poll for completion, one clock between polls
    while True:
        done = yield response.valid
        if done:
            break
        yield

    return (yield response.data)
1673
1674
def dcache_store(dut, addr, data, nc=0):
    """Simulation helper: issue one store to the dcache and wait for it.

    dut  -- object exposing the ``d_in`` request and ``d_out`` response ports
    addr -- address to store to
    data -- value driven onto ``d_in.data``
    nc   -- value driven onto ``d_in.nc`` (default 0)

    ``byte_sel`` is driven with ``~0`` (all ones) while the request is
    presented.  After one clock ``valid`` and ``byte_sel`` are cleared,
    then ``d_out.valid`` is polled once per clock until it reads
    non-zero.  Nothing is returned.
    """
    request, response = dut.d_in, dut.d_out

    # drive the request fields, valid last
    setup = (
        request.load.eq(0),
        request.nc.eq(nc),
        request.data.eq(data),
        request.byte_sel.eq(~0),
        request.addr.eq(addr),
        request.valid.eq(1),
    )
    for assignment in setup:
        yield assignment
    yield

    # withdraw the request after a single clock
    yield request.valid.eq(0)
    yield request.byte_sel.eq(0)
    yield

    # poll for completion, one clock between polls
    while True:
        done = yield response.valid
        if done:
            break
        yield
1688
1689
def dcache_random_sim(dut):
    """Randomised load/store test, checked against a software memory model.

    A 512-entry list of zeros mirrors memory, one entry per 8-byte row.
    Each of the 256 iterations picks a random row in 0..255, loads it,
    stores a random 64-bit value to it, then loads another random row
    and asserts the result equals the model.  Finally rows 0..7 are
    re-read and checked.
    """

    # software model of memory, initially all zero
    sim_mem = [0] * 512

    # clear stuff: everything 0 except priv_mode, which is set
    request, mmu = dut.d_in, dut.m_in
    initial = (
        (request.valid, 0),
        (request.load, 0),
        (request.priv_mode, 1),
        (request.nc, 0),
        (request.addr, 0),
        (request.data, 0),
        (mmu.valid, 0),
        (mmu.addr, 0),
        (mmu.pte, 0),
    )
    for signal, value in initial:
        yield signal.eq(value)

    # wait 4 * clk_period
    for _ in range(4):
        yield

    print ()

    for _ in range(256):
        # write phase: pick a row, update the model, then load and store it
        row = randint(0, 255)
        value = randint(0, (1<<64)-1)
        sim_mem[row] = value
        addr = row * 8

        print ("testing %x data %x" % (addr, value))

        yield from dcache_load(dut, addr)
        yield from dcache_store(dut, addr, value)

        # read phase: pick another row and compare against the model
        row = randint(0, 255)
        sim_data = sim_mem[row]
        addr = row * 8

        data = yield from dcache_load(dut, addr)
        assert data == sim_data, \
            "check %x data %x != %x" % (addr, data, sim_data)

    # final sweep over the first eight rows
    for row in range(8):
        data = yield from dcache_load(dut, row*8)
        assert data == sim_mem[row], \
            "final check %x data %x != %x" % (row*8, data, sim_mem[row])
1736
def dcache_sim(dut):
    """Directed load/store test of the dcache against a pre-loaded SRAM.

    Expected values assume the memory image built under ``__main__``:
    64-bit row ``i`` holds ``(2*i) | ((2*i + 1) << 32)``, and the SRAM is
    addressed with the wishbone address shifted right by 3, so a byte
    address ``a`` reads row ``a >> 3``:

    * 0x4   -> row 0   -> 0x0000000100000000
    * 0x20  -> row 4   -> 0x0000000900000008
    * 0x530 -> row 166 -> 0x0000014D0000014C
    * 0x100 -> row 32  -> 0x0000004100000040

    The two checks of address 0x20 previously expected
    0x0000000100000000 and 0x12345678, which contradicts that image
    (the stores in this test only target 0x530); they now expect the
    row-4 contents.
    """
    # clear stuff
    yield dut.d_in.valid.eq(0)
    yield dut.d_in.load.eq(0)
    yield dut.d_in.priv_mode.eq(1)
    yield dut.d_in.nc.eq(0)
    yield dut.d_in.addr.eq(0)
    yield dut.d_in.data.eq(0)
    yield dut.m_in.valid.eq(0)
    yield dut.m_in.addr.eq(0)
    yield dut.m_in.pte.eq(0)
    # wait 4 * clk_period
    for _ in range(4):
        yield

    # Cacheable read of address 0x4 (row 0)
    data = yield from dcache_load(dut, 0x4)
    addr = yield dut.d_in.addr
    assert data == 0x0000000100000000, \
        "data @%x=%x expected 0x0000000100000000" % (addr, data)

    # idle for 8 clocks before the next request
    for _ in range(8):
        yield

    # Cacheable read of address 0x20 (row 4)
    data = yield from dcache_load(dut, 0x20)
    addr = yield dut.d_in.addr
    assert data == 0x0000000900000008, \
        "data @%x=%x expected 0x0000000900000008" % (addr, data)

    # Cacheable read of address 0x530 (row 166)
    data = yield from dcache_load(dut, 0x530)
    addr = yield dut.d_in.addr
    assert data == 0x0000014D0000014C, \
        "data @%x=%x expected 0000014D0000014C" % (addr, data)

    # 2nd Cacheable read of address 0x530
    data = yield from dcache_load(dut, 0x530)
    addr = yield dut.d_in.addr
    assert data == 0x0000014D0000014C, \
        "data @%x=%x expected 0000014D0000014C" % (addr, data)

    # Non-cacheable read of address 0x100 (row 32)
    data = yield from dcache_load(dut, 0x100, nc=1)
    addr = yield dut.d_in.addr
    assert data == 0x0000004100000040, \
        "data @%x=%x expected 0000004100000040" % (addr, data)

    # Store at address 0x530
    yield from dcache_store(dut, 0x530, 0x121)

    # Second store at address 0x530, overwriting the first
    yield from dcache_store(dut, 0x530, 0x12345678)

    # 3rd Cacheable read of address 0x530: must see the last store
    data = yield from dcache_load(dut, 0x530)
    addr = yield dut.d_in.addr
    assert data == 0x12345678, \
        "data @%x=%x expected 0x12345678" % (addr, data)

    # 2nd Cacheable read of address 0x20: untouched by the stores above
    data = yield from dcache_load(dut, 0x20)
    addr = yield dut.d_in.addr
    assert data == 0x0000000900000008, \
        "data @%x=%x expected 0x0000000900000008" % (addr, data)

    for _ in range(4):
        yield
1816
1817
def test_dcache(mem, test_fn, test_name):
    """Simulate a DCache wired to a wishbone SRAM and run one test process.

    mem       -- initial contents passed as ``init`` to the backing Memory
    test_fn   -- generator function taking the DCache; run as a sync process
    test_name -- suffix inserted into the VCD file name
                 (``test_dcache<test_name>.vcd``)
    """
    dut = DCache()

    # 64-bit wide, 16*64 rows deep backing store behind a wishbone SRAM
    memory = Memory(width=64, depth=16*64, init=mem)
    sram = SRAM(memory=memory, granularity=8)

    top = Module()
    top.submodules.dcache = dut
    top.submodules.sram = sram
    comb = top.d.comb

    # dcache -> sram: control lines map one-to-one by name
    for field in ("cyc", "stb", "we", "sel"):
        comb += getattr(sram.bus, field).eq(getattr(dut.wb_out, field))
    # the low 3 address bits are dropped when addressing the sram
    comb += sram.bus.adr.eq(dut.wb_out.adr[3:])
    comb += sram.bus.dat_w.eq(dut.wb_out.dat)

    # sram -> dcache
    comb += dut.wb_in.ack.eq(sram.bus.ack)
    comb += dut.wb_in.dat.eq(sram.bus.dat_r)

    # nmigen Simulation
    sim = Simulator(top)
    sim.add_clock(1e-6)
    sim.add_sync_process(wrap(test_fn(dut)))

    vcd_name = 'test_dcache%s.vcd' % test_name
    with sim.write_vcd(vcd_name):
        sim.run()
1845
if __name__ == '__main__':
    # dump the RTLIL of a stand-alone DCache
    dut = DCache()
    rtlil_text = rtlil.convert(dut, ports=[])
    with open("test_dcache.il", "w") as il_file:
        il_file.write(rtlil_text)

    # memory image: row i holds 2*i in the low word, 2*i+1 in the high word
    mem = [(row*2) | ((row*2+1) << 32) for row in range(512)]

    test_dcache(mem, dcache_sim, "")
    #test_dcache(None, dcache_random_sim, "random")
1858