1 """DCache
2
3 based on Anton Blanchard microwatt dcache.vhdl
4
5 """
6
7 from enum import Enum, unique
8
9 from nmigen import Module, Signal, Elaboratable, Cat, Repl, Array, Const
10 from nmutil.util import Display
11
12 from random import randint
13
14 from nmigen.cli import main
15 from nmutil.iocontrol import RecordObject
16 from nmigen.utils import log2_int
17 from soc.experiment.mem_types import (LoadStore1ToDCacheType,
18 DCacheToLoadStore1Type,
19 MMUToDCacheType,
20 DCacheToMMUType)
21
22 from soc.experiment.wb_types import (WB_ADDR_BITS, WB_DATA_BITS, WB_SEL_BITS,
23 WBAddrType, WBDataType, WBSelType,
24 WBMasterOut, WBSlaveOut,
25 WBMasterOutVector, WBSlaveOutVector,
26 WBIOMasterOut, WBIOSlaveOut)
27
28 from soc.experiment.cache_ram import CacheRam
29 from soc.experiment.plru import PLRU
30
31 # for test
32 from nmigen_soc.wishbone.sram import SRAM
33 from nmigen import Memory
34 from nmigen.cli import rtlil
35 if True: # set to False to use the cxxsim backend instead of pysim
36 from nmigen.back.pysim import Simulator, Delay, Settle
37 else:
38 from nmigen.sim.cxxsim import Simulator, Delay, Settle
39 from nmutil.util import wrap
40
41
42 # TODO: make these parameters of DCache at some point
43 LINE_SIZE = 64 # Line size in bytes
44 NUM_LINES = 16 # Number of lines in a set
45 NUM_WAYS = 4 # Number of ways
46 TLB_SET_SIZE = 64 # L1 DTLB entries per set
47 TLB_NUM_WAYS = 2 # L1 DTLB number of ways
48 TLB_LG_PGSZ = 12 # L1 DTLB log_2(page_size)
49 LOG_LENGTH = 0 # Non-zero to enable log data collection
50
51 # BRAM organisation: We never access more than
52 # WB_DATA_BITS at a time so, to save
53 # resources, we make the array only that wide and
54 # use consecutive indices to make a cache "line"
55 #
56 # ROW_SIZE is the width in bytes of the BRAM
57 # (based on WB, so 64-bits)
58 ROW_SIZE = WB_DATA_BITS // 8
59
60 # ROW_PER_LINE is the number of rows (wishbone
61 # transactions) in a line
62 ROW_PER_LINE = LINE_SIZE // ROW_SIZE
63
64 # BRAM_ROWS is the number of rows in BRAM needed
65 # to represent the full dcache
66 BRAM_ROWS = NUM_LINES * ROW_PER_LINE
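# with the defaults (64-bit wishbone, LINE_SIZE=64, NUM_LINES=16) this gives
# ROW_SIZE=8, ROW_PER_LINE=8, BRAM_ROWS=128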
67
68 print ("ROW_SIZE", ROW_SIZE)
69 print ("ROW_PER_LINE", ROW_PER_LINE)
70 print ("BRAM_ROWS", BRAM_ROWS)
71 print ("NUM_WAYS", NUM_WAYS)
72
73 # Bit fields counts in the address
74
75 # REAL_ADDR_BITS is the number of real address
76 # bits that we store
77 REAL_ADDR_BITS = 56
78
79 # ROW_BITS is the number of bits to select a row
80 ROW_BITS = log2_int(BRAM_ROWS)
81
82 # ROW_LINE_BITS is the number of bits to select
83 # a row within a line
84 ROW_LINE_BITS = log2_int(ROW_PER_LINE)
85
86 # LINE_OFF_BITS is the number of bits for
87 # the offset in a cache line
88 LINE_OFF_BITS = log2_int(LINE_SIZE)
89
90 # ROW_OFF_BITS is the number of bits for
91 # the offset in a row
92 ROW_OFF_BITS = log2_int(ROW_SIZE)
93
94 # INDEX_BITS is the number of bits to
95 # select a cache line
96 INDEX_BITS = log2_int(NUM_LINES)
97
98 # SET_SIZE_BITS is the log base 2 of the set size
99 SET_SIZE_BITS = LINE_OFF_BITS + INDEX_BITS
100
101 # TAG_BITS is the number of bits of
102 # the tag part of the address
103 TAG_BITS = REAL_ADDR_BITS - SET_SIZE_BITS
104
105 # TAG_WIDTH is the width in bits of each way of the tag RAM
106 TAG_WIDTH = TAG_BITS + 7 - ((TAG_BITS + 7) % 8)
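# i.e. TAG_BITS rounded up to the next multiple of 8, so each way's tag
# starts on a byte boundary in the tag RAM (46 -> 48 with the defaults)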
107
108 # WAY_BITS is the number of bits to select a way
109 WAY_BITS = log2_int(NUM_WAYS)
110
111 # Example of layout for 32 lines of 64 bytes:
112 layout = """\
113 .. tag |index| line |
114 .. | row | |
115 .. | |---| | ROW_LINE_BITS (3)
116 .. | |--- - --| LINE_OFF_BITS (6)
117 .. | |- --| ROW_OFF_BITS (3)
118 .. |----- ---| | ROW_BITS (8)
119 .. |-----| | INDEX_BITS (5)
120 .. --------| | TAG_BITS (45)
121 """
122 print (layout)
123 print ("Dcache TAG %d IDX %d ROW %d ROFF %d LOFF %d RLB %d" % \
124 (TAG_BITS, INDEX_BITS, ROW_BITS,
125 ROW_OFF_BITS, LINE_OFF_BITS, ROW_LINE_BITS))
126 print ("index @: %d-%d" % (LINE_OFF_BITS, SET_SIZE_BITS))
127 print ("row @: %d-%d" % (LINE_OFF_BITS, ROW_OFF_BITS))
128 print ("tag @: %d-%d width %d" % (SET_SIZE_BITS, REAL_ADDR_BITS, TAG_WIDTH))
129
130 TAG_RAM_WIDTH = TAG_WIDTH * NUM_WAYS
131
132 def CacheTagArray():
133 return Array(Signal(TAG_RAM_WIDTH, name="cachetag_%d" % x) \
134 for x in range(NUM_LINES))
135
136 def CacheValidBitsArray():
137 return Array(Signal(NUM_WAYS, name="cachevalid_%d" % x) \
138 for x in range(NUM_LINES))
139
140 def RowPerLineValidArray():
141 return Array(Signal(name="rows_valid%d" % x) \
142 for x in range(ROW_PER_LINE))
143
144 # L1 TLB
145 TLB_SET_BITS = log2_int(TLB_SET_SIZE)
146 TLB_WAY_BITS = log2_int(TLB_NUM_WAYS)
147 TLB_EA_TAG_BITS = 64 - (TLB_LG_PGSZ + TLB_SET_BITS)
148 TLB_TAG_WAY_BITS = TLB_NUM_WAYS * TLB_EA_TAG_BITS
149 TLB_PTE_BITS = 64
150 TLB_PTE_WAY_BITS = TLB_NUM_WAYS * TLB_PTE_BITS
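# with the defaults: TLB_SET_BITS=6, TLB_WAY_BITS=1, TLB_EA_TAG_BITS=46,
# TLB_TAG_WAY_BITS=92, TLB_PTE_WAY_BITS=128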
151
152 def ispow2(x):
153 return (1<<log2_int(x, False)) == x
154
155 assert (LINE_SIZE % ROW_SIZE) == 0, "LINE_SIZE not multiple of ROW_SIZE"
156 assert ispow2(LINE_SIZE), "LINE_SIZE not power of 2"
157 assert ispow2(NUM_LINES), "NUM_LINES not power of 2"
158 assert ispow2(ROW_PER_LINE), "ROW_PER_LINE not power of 2"
159 assert ROW_BITS == (INDEX_BITS + ROW_LINE_BITS), "geometry bits don't add up"
160 assert (LINE_OFF_BITS == ROW_OFF_BITS + ROW_LINE_BITS), \
161 "geometry bits don't add up"
162 assert REAL_ADDR_BITS == (TAG_BITS + INDEX_BITS + LINE_OFF_BITS), \
163 "geometry bits don't add up"
164 assert REAL_ADDR_BITS == (TAG_BITS + ROW_BITS + ROW_OFF_BITS), \
165 "geometry bits don't add up"
166 assert 64 == WB_DATA_BITS, "Can't yet handle wb width that isn't 64-bits"
167 assert SET_SIZE_BITS <= TLB_LG_PGSZ, "Set indexed by virtual address"
168
169
170 def TLBValidBitsArray():
171 return Array(Signal(TLB_NUM_WAYS) for x in range(TLB_SET_SIZE))
172
173 def TLBTagEAArray():
174 return Array(Signal(TLB_EA_TAG_BITS) for x in range (TLB_NUM_WAYS))
175
176 def TLBTagsArray():
177 return Array(Signal(TLB_TAG_WAY_BITS) for x in range (TLB_SET_SIZE))
178
179 def TLBPtesArray():
180 return Array(Signal(TLB_PTE_WAY_BITS) for x in range(TLB_SET_SIZE))
181
182 def HitWaySet():
183 return Array(Signal(WAY_BITS, name="hitway_%d" % x) \
184 for x in range(TLB_NUM_WAYS))
185
186 # Cache RAM interface
187 def CacheRamOut():
188 return Array(Signal(WB_DATA_BITS, name="cache_out%d" % x) \
189 for x in range(NUM_WAYS))
190
191 # PLRU output interface
192 def PLRUOut():
193 return Array(Signal(WAY_BITS) for x in range(NUM_LINES))
194
195 # TLB PLRU output interface
196 def TLBPLRUOut():
197 return Array(Signal(TLB_WAY_BITS) for x in range(TLB_SET_SIZE))
198
199 # Helper functions to decode incoming requests
200 #
201 # Return the cache line index (tag index) for an address
202 def get_index(addr):
203 return addr[LINE_OFF_BITS:SET_SIZE_BITS]
204
205 # Return the cache row index (data memory) for an address
206 def get_row(addr):
207 return addr[ROW_OFF_BITS:SET_SIZE_BITS]
208
209 # Return the index of a row within a line
210 def get_row_of_line(row):
211 return row[:ROW_BITS][:ROW_LINE_BITS]
212
213 # Returns whether this is the last row of a line
214 def is_last_row_addr(addr, last):
215 return addr[ROW_OFF_BITS:LINE_OFF_BITS] == last
216
217 # Returns whether this is the last row of a line
218 def is_last_row(row, last):
219 return get_row_of_line(row) == last
220
221 # Return the next row in the current cache line. We use a
222 # dedicated function in order to limit the size of the
223 # generated adder to be only the bits within a cache line
224 # (3 bits with default settings)
225 def next_row(row):
226 row_v = row[0:ROW_LINE_BITS] + 1
227 return Cat(row_v[:ROW_LINE_BITS], row[ROW_LINE_BITS:])
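# e.g. with ROW_LINE_BITS=3, a row ending in 0b111 wraps back to 0b000
# while the upper (index) bits of the row are left unchanged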
228
229 # Get the tag value from the address
230 def get_tag(addr):
231 return addr[SET_SIZE_BITS:REAL_ADDR_BITS]
232
233 # Read a tag from a tag memory row
234 def read_tag(way, tagset):
235 return tagset.word_select(way, TAG_WIDTH)[:TAG_BITS]
236
237 # Read a TLB tag from a TLB tag memory row
238 def read_tlb_tag(way, tags):
239 return tags.word_select(way, TLB_EA_TAG_BITS)
240
241 # Write a TLB tag to a TLB tag memory row
242 def write_tlb_tag(way, tags, tag):
243 return read_tlb_tag(way, tags).eq(tag)
244
245 # Read a PTE from a TLB PTE memory row
246 def read_tlb_pte(way, ptes):
247 return ptes.word_select(way, TLB_PTE_BITS)
248
249 def write_tlb_pte(way, ptes, newpte):
250 return read_tlb_pte(way, ptes).eq(newpte)
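# Illustrative note: a TLB tag/PTE "row" packs all TLB_NUM_WAYS entries
# side by side; the read_* helpers select one way's slice via word_select
# and the write_* helpers return an assignment to that same slice.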
251
252
253 # Record for storing permission, attribute, etc. bits from a PTE
254 class PermAttr(RecordObject):
255 def __init__(self, name=None):
256 super().__init__(name=name)
257 self.reference = Signal()
258 self.changed = Signal()
259 self.nocache = Signal()
260 self.priv = Signal()
261 self.rd_perm = Signal()
262 self.wr_perm = Signal()
263
264
265 def extract_perm_attr(pte):
266 pa = PermAttr()
267 pa.reference = pte[8]
268 pa.changed = pte[7]
269 pa.nocache = pte[5]
270 pa.priv = pte[3]
271 pa.rd_perm = pte[2]
272 pa.wr_perm = pte[1]
273 return pa
274
275
276 # Type of operation on a "valid" input
277 @unique
278 class Op(Enum):
279 OP_NONE = 0
280 OP_BAD = 1 # NC cache hit, TLB miss, prot/RC failure
281 OP_STCX_FAIL = 2 # conditional store w/o reservation
282 OP_LOAD_HIT = 3 # Cache hit on load
283 OP_LOAD_MISS = 4 # Load missing cache
284 OP_LOAD_NC = 5 # Non-cachable load
285 OP_STORE_HIT = 6 # Store hitting cache
286 OP_STORE_MISS = 7 # Store missing cache
287
288
289 # Cache state machine
290 @unique
291 class State(Enum):
292 IDLE = 0 # Normal load hit processing
293 RELOAD_WAIT_ACK = 1 # Cache reload wait ack
294 STORE_WAIT_ACK = 2 # Store wait ack
295 NC_LOAD_WAIT_ACK = 3 # Non-cachable load wait ack
296
297
298 # Dcache operations:
299 #
300 # In order to make timing, we use the BRAMs with
301 # an output buffer, which means that the BRAM
302 # output is delayed by an extra cycle.
303 #
304 # Thus, the dcache has a 2-stage internal pipeline
305 # for cache hits with no stalls.
306 #
307 # All other operations are handled via stalling
308 # in the first stage.
309 #
310 # The second stage can thus complete a hit at the same
311 # time as the first stage emits a stall for a complex op.
312 #
313 # Stage 0 register, basically contains just the latched request
314
315 class RegStage0(RecordObject):
316 def __init__(self, name=None):
317 super().__init__(name=name)
318 self.req = LoadStore1ToDCacheType(name="lsmem")
319 self.tlbie = Signal()
320 self.doall = Signal()
321 self.tlbld = Signal()
322 self.mmu_req = Signal() # indicates source of request
323
324
325 class MemAccessRequest(RecordObject):
326 def __init__(self, name=None):
327 super().__init__(name=name)
328 self.op = Signal(Op)
329 self.valid = Signal()
330 self.dcbz = Signal()
331 self.real_addr = Signal(REAL_ADDR_BITS)
332 self.data = Signal(64)
333 self.byte_sel = Signal(8)
334 self.hit_way = Signal(WAY_BITS)
335 self.same_tag = Signal()
336 self.mmu_req = Signal()
337
338
339 # First stage register, contains state for stage 1 of load hits
340 # and for the state machine used by all other operations
341 class RegStage1(RecordObject):
342 def __init__(self, name=None):
343 super().__init__(name=name)
344 # Info about the request
345 self.full = Signal() # have uncompleted request
346 self.mmu_req = Signal() # request is from MMU
347 self.req = MemAccessRequest(name="reqmem")
348
349 # Cache hit state
350 self.hit_way = Signal(WAY_BITS)
351 self.hit_load_valid = Signal()
352 self.hit_index = Signal(INDEX_BITS)
353 self.cache_hit = Signal()
354
355 # TLB hit state
356 self.tlb_hit = Signal()
357 self.tlb_hit_way = Signal(TLB_NUM_WAYS)
358 self.tlb_hit_index = Signal(TLB_WAY_BITS)
359
360 # 2-stage data buffer for data forwarded from writes to reads
361 self.forward_data1 = Signal(64)
362 self.forward_data2 = Signal(64)
363 self.forward_sel1 = Signal(8)
364 self.forward_valid1 = Signal()
365 self.forward_way1 = Signal(WAY_BITS)
366 self.forward_row1 = Signal(ROW_BITS)
367 self.use_forward1 = Signal()
368 self.forward_sel = Signal(8)
369
370 # Cache miss state (reload state machine)
371 self.state = Signal(State)
372 self.dcbz = Signal()
373 self.write_bram = Signal()
374 self.write_tag = Signal()
375 self.slow_valid = Signal()
376 self.wb = WBMasterOut("wb")
377 self.reload_tag = Signal(TAG_BITS)
378 self.store_way = Signal(WAY_BITS)
379 self.store_row = Signal(ROW_BITS)
380 self.store_index = Signal(INDEX_BITS)
381 self.end_row_ix = Signal(ROW_LINE_BITS)
382 self.rows_valid = RowPerLineValidArray()
383 self.acks_pending = Signal(3)
384 self.inc_acks = Signal()
385 self.dec_acks = Signal()
386
387 # Signals to complete (possibly with error)
388 self.ls_valid = Signal()
389 self.ls_error = Signal()
390 self.mmu_done = Signal()
391 self.mmu_error = Signal()
392 self.cache_paradox = Signal()
393
394 # Signal to complete a failed stcx.
395 self.stcx_fail = Signal()
396
397
398 # Reservation information
399 class Reservation(RecordObject):
400 def __init__(self):
401 super().__init__()
402 self.valid = Signal()
403 self.addr = Signal(64-LINE_OFF_BITS)
404
405
406 class DTLBUpdate(Elaboratable):
407 def __init__(self):
408 self.tlbie = Signal()
409 self.tlbwe = Signal()
410 self.doall = Signal()
411 self.updated = Signal()
412 self.v_updated = Signal()
413 self.tlb_hit = Signal()
414 self.tlb_req_index = Signal(TLB_SET_BITS)
415
416 self.tlb_hit_way = Signal(TLB_WAY_BITS)
417 self.tlb_tag_way = Signal(TLB_TAG_WAY_BITS)
418 self.tlb_pte_way = Signal(TLB_PTE_WAY_BITS)
419 self.repl_way = Signal(TLB_WAY_BITS)
420 self.eatag = Signal(TLB_EA_TAG_BITS)
421 self.pte_data = Signal(TLB_PTE_BITS)
422
423 self.dv = Signal(TLB_PTE_WAY_BITS)
424
425 self.tb_out = Signal(TLB_TAG_WAY_BITS)
426 self.pb_out = Signal(TLB_NUM_WAYS)
427 self.db_out = Signal(TLB_PTE_WAY_BITS)
428
429 def elaborate(self, platform):
430 m = Module()
431 comb = m.d.comb
432 sync = m.d.sync
433
434 tagset = Signal(TLB_TAG_WAY_BITS)
435 pteset = Signal(TLB_PTE_WAY_BITS)
436
437 tb_out, pb_out, db_out = self.tb_out, self.pb_out, self.db_out
438
439 with m.If(self.tlbie & self.doall):
440 pass # clear all back in parent
441 with m.Elif(self.tlbie):
442 with m.If(self.tlb_hit):
443 comb += db_out.eq(self.dv)
444 comb += db_out.bit_select(self.tlb_hit_way, 1).eq(0) # clear valid bit
445 comb += self.v_updated.eq(1)
446
447 with m.Elif(self.tlbwe):
448
449 comb += tagset.eq(self.tlb_tag_way)
450 comb += write_tlb_tag(self.repl_way, tagset, self.eatag)
451 comb += tb_out.eq(tagset)
452
453 comb += pteset.eq(self.tlb_pte_way)
454 comb += write_tlb_pte(self.repl_way, pteset, self.pte_data)
455 comb += pb_out.eq(pteset)
456
457 comb += db_out.bit_select(self.repl_way, 1).eq(1)
458
459 comb += self.updated.eq(1)
460 comb += self.v_updated.eq(1)
461
462 return m
463
464
465 class DCachePendingHit(Elaboratable):
466
467 def __init__(self, tlb_pte_way, tlb_valid_way, tlb_hit_way,
468 cache_valid_idx, cache_tag_set,
469 req_addr,
470 hit_set):
471
472 self.go = Signal()
473 self.virt_mode = Signal()
474 self.is_hit = Signal()
475 self.tlb_hit = Signal()
476 self.hit_way = Signal(WAY_BITS)
477 self.rel_match = Signal()
478 self.req_index = Signal(INDEX_BITS)
479 self.reload_tag = Signal(TAG_BITS)
480
481 self.tlb_hit_way = tlb_hit_way
482 self.tlb_pte_way = tlb_pte_way
483 self.tlb_valid_way = tlb_valid_way
484 self.cache_valid_idx = cache_valid_idx
485 self.cache_tag_set = cache_tag_set
486 self.req_addr = req_addr
487 self.hit_set = hit_set
488
489 def elaborate(self, platform):
490 m = Module()
491 comb = m.d.comb
492 sync = m.d.sync
493
494 go = self.go
495 virt_mode = self.virt_mode
496 is_hit = self.is_hit
497 tlb_pte_way = self.tlb_pte_way
498 tlb_valid_way = self.tlb_valid_way
499 cache_valid_idx = self.cache_valid_idx
500 cache_tag_set = self.cache_tag_set
501 req_addr = self.req_addr
502 tlb_hit_way = self.tlb_hit_way
503 tlb_hit = self.tlb_hit
504 hit_set = self.hit_set
505 hit_way = self.hit_way
506 rel_match = self.rel_match
507 req_index = self.req_index
508 reload_tag = self.reload_tag
509
510 rel_matches = Array(Signal(name="rel_matches_%d" % i) \
511 for i in range(TLB_NUM_WAYS))
512 hit_way_set = HitWaySet()
513
514 # Test if pending request is a hit on any way
515 # In order to make timing in virtual mode,
516 # when we are using the TLB, we compare each
517 # way with each of the real addresses from each way of
518 # the TLB, and then decide later which match to use.
519
520 with m.If(virt_mode):
521 for j in range(TLB_NUM_WAYS):
522 s_tag = Signal(TAG_BITS, name="s_tag%d" % j)
523 s_hit = Signal()
524 s_pte = Signal(TLB_PTE_BITS)
525 s_ra = Signal(REAL_ADDR_BITS)
526 comb += s_pte.eq(read_tlb_pte(j, tlb_pte_way))
527 comb += s_ra.eq(Cat(req_addr[0:TLB_LG_PGSZ],
528 s_pte[TLB_LG_PGSZ:REAL_ADDR_BITS]))
529 comb += s_tag.eq(get_tag(s_ra))
530
531 for i in range(NUM_WAYS):
532 is_tag_hit = Signal(name="is_tag_hit_%d_%d" % (j, i))
533 comb += is_tag_hit.eq(go & cache_valid_idx[i] &
534 (read_tag(i, cache_tag_set) == s_tag)
535 & tlb_valid_way[j])
536 with m.If(is_tag_hit):
537 comb += hit_way_set[j].eq(i)
538 comb += s_hit.eq(1)
539 comb += hit_set[j].eq(s_hit)
540 with m.If(s_tag == reload_tag):
541 comb += rel_matches[j].eq(1)
542 with m.If(tlb_hit):
543 comb += is_hit.eq(hit_set[tlb_hit_way])
544 comb += hit_way.eq(hit_way_set[tlb_hit_way])
545 comb += rel_match.eq(rel_matches[tlb_hit_way])
546 with m.Else():
547 s_tag = Signal(TAG_BITS)
548 comb += s_tag.eq(get_tag(req_addr))
549 for i in range(NUM_WAYS):
550 is_tag_hit = Signal(name="is_tag_hit_%d" % i)
551 comb += is_tag_hit.eq(go & cache_valid_idx[i] &
552 (read_tag(i, cache_tag_set) == s_tag))
553 with m.If(is_tag_hit):
554 comb += hit_way.eq(i)
555 comb += is_hit.eq(1)
556 with m.If(s_tag == reload_tag):
557 comb += rel_match.eq(1)
558
559 return m
560
561
562 class DCache(Elaboratable):
563 """Set associative dcache write-through
564 TODO (in no specific order):
565 * See list in icache.vhdl
566 * Complete load misses on the cycle when WB data comes instead of
567 at the end of line (this requires dealing with requests coming in
568 while not idle...)
569 """
570 def __init__(self):
571 self.d_in = LoadStore1ToDCacheType("d_in")
572 self.d_out = DCacheToLoadStore1Type("d_out")
573
574 self.m_in = MMUToDCacheType("m_in")
575 self.m_out = DCacheToMMUType("m_out")
576
577 self.stall_out = Signal()
578
579 self.wb_out = WBMasterOut()
580 self.wb_in = WBSlaveOut()
581
582 self.log_out = Signal(20)
583
584 def stage_0(self, m, r0, r1, r0_full):
585 """Latch the request in r0.req as long as we're not stalling
586 """
587 comb = m.d.comb
588 sync = m.d.sync
589 d_in, d_out, m_in = self.d_in, self.d_out, self.m_in
590
591 r = RegStage0("stage0")
592
593 # TODO, this goes in unit tests and formal proofs
594 with m.If(d_in.valid & m_in.valid):
595 sync += Display("request collision loadstore vs MMU")
596
597 with m.If(m_in.valid):
598 sync += r.req.valid.eq(1)
599 sync += r.req.load.eq(~(m_in.tlbie | m_in.tlbld))
600 sync += r.req.dcbz.eq(0)
601 sync += r.req.nc.eq(0)
602 sync += r.req.reserve.eq(0)
603 sync += r.req.virt_mode.eq(1)
604 sync += r.req.priv_mode.eq(1)
605 sync += r.req.addr.eq(m_in.addr)
606 sync += r.req.data.eq(m_in.pte)
607 sync += r.req.byte_sel.eq(~0) # Const -1 sets all to 0b111....
608 sync += r.tlbie.eq(m_in.tlbie)
609 sync += r.doall.eq(m_in.doall)
610 sync += r.tlbld.eq(m_in.tlbld)
611 sync += r.mmu_req.eq(1)
612 with m.Else():
613 sync += r.req.eq(d_in)
614 sync += r.tlbie.eq(0)
615 sync += r.doall.eq(0)
616 sync += r.tlbld.eq(0)
617 sync += r.mmu_req.eq(0)
618 with m.If(~(r1.full & r0_full)):
619 sync += r0.eq(r)
620 sync += r0_full.eq(r.req.valid)
621
622 def tlb_read(self, m, r0_stall, tlb_valid_way,
623 tlb_tag_way, tlb_pte_way, dtlb_valid_bits,
624 dtlb_tags, dtlb_ptes):
625 """TLB
626 Operates in the second cycle on the request latched in r0.req.
627 TLB updates write the entry at the end of the second cycle.
628 """
629 comb = m.d.comb
630 sync = m.d.sync
631 m_in, d_in = self.m_in, self.d_in
632
633 index = Signal(TLB_SET_BITS)
634 addrbits = Signal(TLB_SET_BITS)
635
636 amin = TLB_LG_PGSZ
637 amax = TLB_LG_PGSZ + TLB_SET_BITS
638
639 with m.If(m_in.valid):
640 comb += addrbits.eq(m_in.addr[amin : amax])
641 with m.Else():
642 comb += addrbits.eq(d_in.addr[amin : amax])
643 comb += index.eq(addrbits)
644
645 # If we have any op and the previous op isn't finished,
646 # then keep the same output for next cycle.
647 with m.If(~r0_stall):
648 sync += tlb_valid_way.eq(dtlb_valid_bits[index])
649 sync += tlb_tag_way.eq(dtlb_tags[index])
650 sync += tlb_pte_way.eq(dtlb_ptes[index])
651
652 def maybe_tlb_plrus(self, m, r1, tlb_plru_victim):
653 """Generate TLB PLRUs
654 """
655 comb = m.d.comb
656 sync = m.d.sync
657
658 if TLB_NUM_WAYS == 0:
659 return
660 for i in range(TLB_SET_SIZE):
661 # TLB PLRU interface
662 tlb_plru = PLRU(TLB_WAY_BITS)
663 setattr(m.submodules, "maybe_plru_%d" % i, tlb_plru)
664 tlb_plru_acc_en = Signal()
665
666 comb += tlb_plru_acc_en.eq(r1.tlb_hit & (r1.tlb_hit_index == i))
667 comb += tlb_plru.acc_en.eq(tlb_plru_acc_en)
668 comb += tlb_plru.acc.eq(r1.tlb_hit_way)
669 comb += tlb_plru_victim[i].eq(tlb_plru.lru_o)
670
671 def tlb_search(self, m, tlb_req_index, r0, r0_valid,
672 tlb_valid_way, tlb_tag_way, tlb_hit_way,
673 tlb_pte_way, pte, tlb_hit, valid_ra, perm_attr, ra):
674
675 comb = m.d.comb
676 sync = m.d.sync
677
678 hitway = Signal(TLB_WAY_BITS)
679 hit = Signal()
680 eatag = Signal(TLB_EA_TAG_BITS)
681
682 TLB_LG_END = TLB_LG_PGSZ + TLB_SET_BITS
683 comb += tlb_req_index.eq(r0.req.addr[TLB_LG_PGSZ : TLB_LG_END])
684 comb += eatag.eq(r0.req.addr[TLB_LG_END : 64 ])
685
686 for i in range(TLB_NUM_WAYS):
687 is_tag_hit = Signal()
688 comb += is_tag_hit.eq(tlb_valid_way[i]
689 & read_tlb_tag(i, tlb_tag_way) == eatag)
690 with m.If(is_tag_hit):
691 comb += hitway.eq(i)
692 comb += hit.eq(1)
693
694 comb += tlb_hit.eq(hit & r0_valid)
695 comb += tlb_hit_way.eq(hitway)
696
697 with m.If(tlb_hit):
698 comb += pte.eq(read_tlb_pte(hitway, tlb_pte_way))
699 with m.Else():
700 comb += pte.eq(0)
701 comb += valid_ra.eq(tlb_hit | ~r0.req.virt_mode)
702 with m.If(r0.req.virt_mode):
703 comb += ra.eq(Cat(Const(0, ROW_OFF_BITS),
704 r0.req.addr[ROW_OFF_BITS:TLB_LG_PGSZ],
705 pte[TLB_LG_PGSZ:REAL_ADDR_BITS]))
706 comb += perm_attr.eq(extract_perm_attr(pte))
707 with m.Else():
708 comb += ra.eq(Cat(Const(0, ROW_OFF_BITS),
709 r0.req.addr[ROW_OFF_BITS:REAL_ADDR_BITS]))
710
711 comb += perm_attr.reference.eq(1)
712 comb += perm_attr.changed.eq(1)
713 comb += perm_attr.nocache.eq(0)
714 comb += perm_attr.priv.eq(1)
715 comb += perm_attr.rd_perm.eq(1)
716 comb += perm_attr.wr_perm.eq(1)
717
718 def tlb_update(self, m, r0_valid, r0, dtlb_valid_bits, tlb_req_index,
719 tlb_hit_way, tlb_hit, tlb_plru_victim, tlb_tag_way,
720 dtlb_tags, tlb_pte_way, dtlb_ptes):
721
722 comb = m.d.comb
723 sync = m.d.sync
724
725 tlbie = Signal()
726 tlbwe = Signal()
727
728 comb += tlbie.eq(r0_valid & r0.tlbie)
729 comb += tlbwe.eq(r0_valid & r0.tlbld)
730
731 m.submodules.tlb_update = d = DTLBUpdate()
732 with m.If(tlbie & r0.doall):
733 # clear all valid bits at once
734 for i in range(TLB_SET_SIZE):
735 sync += dtlb_valid_bits[i].eq(0)
736 with m.If(d.updated):
737 sync += dtlb_tags[tlb_req_index].eq(d.tb_out)
738 sync += dtlb_ptes[tlb_req_index].eq(d.pb_out)
739 with m.If(d.v_updated):
740 sync += dtlb_valid_bits[tlb_req_index].eq(d.db_out)
741
742 comb += d.dv.eq(dtlb_valid_bits[tlb_req_index])
743
744 comb += d.tlbie.eq(tlbie)
745 comb += d.tlbwe.eq(tlbwe)
746 comb += d.doall.eq(r0.doall)
747 comb += d.tlb_hit.eq(tlb_hit)
748 comb += d.tlb_hit_way.eq(tlb_hit_way)
749 comb += d.tlb_tag_way.eq(tlb_tag_way)
750 comb += d.tlb_pte_way.eq(tlb_pte_way)
751 comb += d.tlb_req_index.eq(tlb_req_index)
752
753 with m.If(tlb_hit):
754 comb += d.repl_way.eq(tlb_hit_way)
755 with m.Else():
756 comb += d.repl_way.eq(tlb_plru_victim[tlb_req_index])
757 comb += d.eatag.eq(r0.req.addr[TLB_LG_PGSZ + TLB_SET_BITS:64])
758 comb += d.pte_data.eq(r0.req.data)
759
760 def maybe_plrus(self, m, r1, plru_victim):
761 """Generate PLRUs
762 """
763 comb = m.d.comb
764 sync = m.d.sync
765
766 if NUM_WAYS == 0:
767 return
768
769 for i in range(NUM_LINES):
770 # PLRU interface
771 plru = PLRU(WAY_BITS)
772 setattr(m.submodules, "plru%d" % i, plru)
773 plru_acc_en = Signal()
774
775 comb += plru_acc_en.eq(r1.cache_hit & (r1.hit_index == i))
776 comb += plru.acc_en.eq(plru_acc_en)
777 comb += plru.acc.eq(r1.hit_way)
778 comb += plru_victim[i].eq(plru.lru_o)
779
780 def cache_tag_read(self, m, r0_stall, req_index, cache_tag_set, cache_tags):
781 """Cache tag RAM read port
782 """
783 comb = m.d.comb
784 sync = m.d.sync
785 m_in, d_in = self.m_in, self.d_in
786
787 index = Signal(INDEX_BITS)
788
789 with m.If(r0_stall):
790 comb += index.eq(req_index)
791 with m.Elif(m_in.valid):
792 comb += index.eq(get_index(m_in.addr))
793 with m.Else():
794 comb += index.eq(get_index(d_in.addr))
795 sync += cache_tag_set.eq(cache_tags[index])
796
797 def dcache_request(self, m, r0, ra, req_index, req_row, req_tag,
798 r0_valid, r1, cache_valids, replace_way,
799 use_forward1_next, use_forward2_next,
800 req_hit_way, plru_victim, rc_ok, perm_attr,
801 valid_ra, perm_ok, access_ok, req_op, req_go,
802 tlb_pte_way,
803 tlb_hit, tlb_hit_way, tlb_valid_way, cache_tag_set,
804 cancel_store, req_same_tag, r0_stall, early_req_row):
805 """Cache request parsing and hit detection
806 """
807
808 comb = m.d.comb
809 sync = m.d.sync
810 m_in, d_in = self.m_in, self.d_in
811
812 is_hit = Signal()
813 hit_way = Signal(WAY_BITS)
814 op = Signal(Op)
815 opsel = Signal(3)
816 go = Signal()
817 nc = Signal()
818 hit_set = Array(Signal(name="hit_set_%d" % i) \
819 for i in range(TLB_NUM_WAYS))
820 cache_valid_idx = Signal(NUM_WAYS)
821
822 # Extract line, row and tag from request
823 comb += req_index.eq(get_index(r0.req.addr))
824 comb += req_row.eq(get_row(r0.req.addr))
825 comb += req_tag.eq(get_tag(ra))
826
827 if False: # display on comb is a bit... busy.
828 comb += Display("dcache_req addr:%x ra: %x idx: %x tag: %x row: %x",
829 r0.req.addr, ra, req_index, req_tag, req_row)
830
831 comb += go.eq(r0_valid & ~(r0.tlbie | r0.tlbld) & ~r1.ls_error)
832 comb += cache_valid_idx.eq(cache_valids[req_index])
833
834 m.submodules.dcache_pend = dc = DCachePendingHit(tlb_pte_way,
835 tlb_valid_way, tlb_hit_way,
836 cache_valid_idx, cache_tag_set,
837 r0.req.addr,
838 hit_set)
839
840 comb += dc.tlb_hit.eq(tlb_hit)
841 comb += dc.reload_tag.eq(r1.reload_tag)
842 comb += dc.virt_mode.eq(r0.req.virt_mode)
843 comb += dc.go.eq(go)
844 comb += dc.req_index.eq(req_index)
845 comb += is_hit.eq(dc.is_hit)
846 comb += hit_way.eq(dc.hit_way)
847 comb += req_same_tag.eq(dc.rel_match)
848
849 # See if the request matches the line currently being reloaded
850 with m.If((r1.state == State.RELOAD_WAIT_ACK) &
851 (req_index == r1.store_index) & req_same_tag):
852 # For a store, consider this a hit even if the row isn't
853 # valid since it will be by the time we perform the store.
854 # For a load, check the appropriate row valid bit.
855 valid = r1.rows_valid[req_row[:ROW_LINE_BITS]]
856 comb += is_hit.eq(~r0.req.load | valid)
857 comb += hit_way.eq(replace_way)
858
859 # Whether to use forwarded data for a load or not
860 with m.If((get_row(r1.req.real_addr) == req_row) &
861 (r1.req.hit_way == hit_way)):
862 # Only need to consider r1.write_bram here, since if we
863 # are writing refill data here, then we don't have a
864 # cache hit this cycle on the line being refilled.
865 # (There is the possibility that the load following the
866 # load miss that started the refill could be to the old
867 # contents of the victim line, since it is a couple of
868 # cycles after the refill starts before we see the updated
869 # cache tag. In that case we don't use the bypass.)
870 comb += use_forward1_next.eq(r1.write_bram)
871 with m.If((r1.forward_row1 == req_row) & (r1.forward_way1 == hit_way)):
872 comb += use_forward2_next.eq(r1.forward_valid1)
873
874 # The way that matched on a hit
875 comb += req_hit_way.eq(hit_way)
876
877 # The way to replace on a miss
878 with m.If(r1.write_tag):
879 comb += replace_way.eq(plru_victim[r1.store_index])
880 with m.Else():
881 comb += replace_way.eq(r1.store_way)
882
883 # work out whether we have permission for this access
884 # NB we don't yet implement AMR, thus no KUAP
885 comb += rc_ok.eq(perm_attr.reference
886 & (r0.req.load | perm_attr.changed)
887 )
888 comb += perm_ok.eq((r0.req.priv_mode | ~perm_attr.priv) &
889 (perm_attr.wr_perm |
890 (r0.req.load & perm_attr.rd_perm)))
891 comb += access_ok.eq(valid_ra & perm_ok & rc_ok)
892 # Combine the request and cache hit status to decide what
893 # operation needs to be done
894 comb += nc.eq(r0.req.nc | perm_attr.nocache)
895 comb += op.eq(Op.OP_NONE)
896 with m.If(go):
897 with m.If(~access_ok):
898 comb += op.eq(Op.OP_BAD)
899 with m.Elif(cancel_store):
900 comb += op.eq(Op.OP_STCX_FAIL)
901 with m.Else():
902 comb += opsel.eq(Cat(is_hit, nc, r0.req.load))
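# opsel bit 0 = is_hit, bit 1 = nc, bit 2 = load; e.g. 0b101 is a
# cacheable load that hit, while 0b011 (NC "store" hit) maps to OP_BAD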
903 with m.Switch(opsel):
904 with m.Case(0b101): comb += op.eq(Op.OP_LOAD_HIT)
905 with m.Case(0b100): comb += op.eq(Op.OP_LOAD_MISS)
906 with m.Case(0b110): comb += op.eq(Op.OP_LOAD_NC)
907 with m.Case(0b001): comb += op.eq(Op.OP_STORE_HIT)
908 with m.Case(0b000): comb += op.eq(Op.OP_STORE_MISS)
909 with m.Case(0b010): comb += op.eq(Op.OP_STORE_MISS)
910 with m.Case(0b011): comb += op.eq(Op.OP_BAD)
911 with m.Case(0b111): comb += op.eq(Op.OP_BAD)
912 comb += req_op.eq(op)
913 comb += req_go.eq(go)
914
915 # Version of the row number that is valid one cycle earlier
916 # in the cases where we need to read the cache data BRAM.
917 # If we're stalling then we need to keep reading the last
918 # row requested.
919 with m.If(~r0_stall):
920 with m.If(m_in.valid):
921 comb += early_req_row.eq(get_row(m_in.addr))
922 with m.Else():
923 comb += early_req_row.eq(get_row(d_in.addr))
924 with m.Else():
925 comb += early_req_row.eq(req_row)
926
927 def reservation_comb(self, m, cancel_store, set_rsrv, clear_rsrv,
928 r0_valid, r0, reservation):
929 """Handle load-with-reservation and store-conditional instructions
930 """
931 comb = m.d.comb
932 sync = m.d.sync
933
934 with m.If(r0_valid & r0.req.reserve):
935 # XXX generate alignment interrupt if address
936 # is not aligned XXX or if r0.req.nc = '1'
937 with m.If(r0.req.load):
938 comb += set_rsrv.eq(1) # load with reservation
939 with m.Else():
940 comb += clear_rsrv.eq(1) # store conditional
941 with m.If(~reservation.valid |
942 (r0.req.addr[LINE_OFF_BITS:64] != reservation.addr)):
943 comb += cancel_store.eq(1)
944
945 def reservation_reg(self, m, r0_valid, access_ok, set_rsrv, clear_rsrv,
946 reservation, r0):
947
948 comb = m.d.comb
949 sync = m.d.sync
950
951 with m.If(r0_valid & access_ok):
952 with m.If(clear_rsrv):
953 sync += reservation.valid.eq(0)
954 with m.Elif(set_rsrv):
955 sync += reservation.valid.eq(1)
956 sync += reservation.addr.eq(r0.req.addr[LINE_OFF_BITS:64])
957
958 def writeback_control(self, m, r1, cache_out):
959 """Return data for loads & completion control logic
960 """
961 comb = m.d.comb
962 sync = m.d.sync
963 d_out, m_out = self.d_out, self.m_out
964
965 data_out = Signal(64)
966 data_fwd = Signal(64)
967
968 # Use the bypass if we are reading the row that was
969 # written 1 or 2 cycles ago, including for the
970 # slow_valid = 1 case (i.e. completing a load
971 # miss or a non-cacheable load).
972 with m.If(r1.use_forward1):
973 comb += data_fwd.eq(r1.forward_data1)
974 with m.Else():
975 comb += data_fwd.eq(r1.forward_data2)
976
977 comb += data_out.eq(cache_out[r1.hit_way])
978
979 for i in range(8):
980 with m.If(r1.forward_sel[i]):
981 dsel = data_fwd.word_select(i, 8)
982 comb += data_out.word_select(i, 8).eq(dsel)
983
984 comb += d_out.valid.eq(r1.ls_valid)
985 comb += d_out.data.eq(data_out)
986 comb += d_out.store_done.eq(~r1.stcx_fail)
987 comb += d_out.error.eq(r1.ls_error)
988 comb += d_out.cache_paradox.eq(r1.cache_paradox)
989
990 # Outputs to MMU
991 comb += m_out.done.eq(r1.mmu_done)
992 comb += m_out.err.eq(r1.mmu_error)
993 comb += m_out.data.eq(data_out)
994
995 # We have a valid load or store hit or we just completed
996 # a slow op such as a load miss, a NC load or a store
997 #
998 # Note: the load hit is delayed by one cycle. However it
999 # can still not collide with r.slow_valid (well unless I
1000 # miscalculated) because slow_valid can only be set on a
1001 # subsequent request and not on its first cycle (the state
1002 # machine must have advanced), which makes slow_valid
1003 # at least 2 cycles from the previous hit_load_valid.
1004
1005 # Sanity: Only one of these must be set in any given cycle
1006
1007 if False: # TODO: need Display to get this to work
1008 assert (r1.slow_valid & r1.stcx_fail) != 1, \
1009 "unexpected slow_valid collision with stcx_fail"
1010
1011 assert ((r1.slow_valid | r1.stcx_fail) | r1.hit_load_valid) != 1, \
1012 "unexpected hit_load_delayed collision with slow_valid"
1013
1014 with m.If(~r1.mmu_req):
1015 # Request came from loadstore1...
1016 # Load hit case is the standard path
1017 with m.If(r1.hit_load_valid):
1018 sync += Display("completing load hit data=%x", data_out)
1019
1020 # error cases complete without stalling
1021 with m.If(r1.ls_error):
1022 sync += Display("completing ld/st with error")
1023
1024 # Slow ops (load miss, NC, stores)
1025 with m.If(r1.slow_valid):
1026 sync += Display("completing store or load miss data=%x",
1027 data_out)
1028
1029 with m.Else():
1030 # Request came from MMU
1031 with m.If(r1.hit_load_valid):
1032 sync += Display("completing load hit to MMU, data=%x",
1033 m_out.data)
1034 # error cases complete without stalling
1035 with m.If(r1.mmu_error):
1036 sync += Display("combpleting MMU ld with error")
1037
1038 # Slow ops (i.e. load miss)
1039 with m.If(r1.slow_valid):
1040 sync += Display("completing MMU load miss, data=%x",
1041 m_out.data)
1042
1043 def rams(self, m, r1, early_req_row, cache_out, replace_way):
1044 """rams
1045 Generate a cache RAM for each way. This handles the normal
1046 reads, writes from reloads and the special store-hit update
1047 path as well.
1048
1049 Note: the BRAMs have an extra read buffer, meaning the output
1050 is pipelined an extra cycle. This differs from the
1051 icache. The writeback logic needs to take that into
1052 account by using 1-cycle delayed signals for load hits.
1053 """
1054 comb = m.d.comb
1055 wb_in = self.wb_in
1056
1057 for i in range(NUM_WAYS):
1058 do_read = Signal(name="do_rd%d" % i)
1059 rd_addr = Signal(ROW_BITS)
1060 do_write = Signal(name="do_wr%d" % i)
1061 wr_addr = Signal(ROW_BITS)
1062 wr_data = Signal(WB_DATA_BITS)
1063 wr_sel = Signal(ROW_SIZE)
1064 wr_sel_m = Signal(ROW_SIZE)
1065 _d_out = Signal(WB_DATA_BITS, name="dout_%d" % i)
1066
1067 way = CacheRam(ROW_BITS, WB_DATA_BITS, True)
1068 setattr(m.submodules, "cacheram_%d" % i, way)
1069
1070 comb += way.rd_en.eq(do_read)
1071 comb += way.rd_addr.eq(rd_addr)
1072 comb += _d_out.eq(way.rd_data_o)
1073 comb += way.wr_sel.eq(wr_sel_m)
1074 comb += way.wr_addr.eq(wr_addr)
1075 comb += way.wr_data.eq(wr_data)
1076
1077 # Cache hit reads
1078 comb += do_read.eq(1)
1079 comb += rd_addr.eq(early_req_row[:ROW_BITS])
1080 comb += cache_out[i].eq(_d_out)
1081
1082 # Write mux:
1083 #
1084 # Defaults to wishbone read responses (cache refill)
1085 #
1086 # For timing, the mux on wr_data/sel/addr is not
1087 # dependent on anything other than the current state.
1088
1089 with m.If(r1.write_bram):
1090 # Write store data to BRAM. This happens one
1091 # cycle after the store is in r0.
1092 comb += wr_data.eq(r1.req.data)
1093 comb += wr_sel.eq(r1.req.byte_sel)
1094 comb += wr_addr.eq(get_row(r1.req.real_addr))
1095
1096 with m.If(i == r1.req.hit_way):
1097 comb += do_write.eq(1)
1098 with m.Else():
1099 # Otherwise, we might be doing a reload or a DCBZ
1100 with m.If(r1.dcbz):
1101 comb += wr_data.eq(0)
1102 with m.Else():
1103 comb += wr_data.eq(wb_in.dat)
1104 comb += wr_addr.eq(r1.store_row)
1105 comb += wr_sel.eq(~0) # all 1s
1106
1107 with m.If((r1.state == State.RELOAD_WAIT_ACK)
1108 & wb_in.ack & (replace_way == i)):
1109 comb += do_write.eq(1)
1110
1111 # Mask write selects with do_write since BRAM
1112 # doesn't have a global write-enable
1113 with m.If(do_write):
1114 comb += wr_sel_m.eq(wr_sel)
1115
1116 # Cache hit synchronous machine for the easy case.
1117 # This handles load hits.
1118 # It also handles error cases (TLB miss, cache paradox)
1119 def dcache_fast_hit(self, m, req_op, r0_valid, r0, r1,
1120 req_hit_way, req_index, req_tag, access_ok,
1121 tlb_hit, tlb_hit_way, tlb_req_index):
1122
1123 comb = m.d.comb
1124 sync = m.d.sync
1125
1126 with m.If(req_op != Op.OP_NONE):
1127 sync += Display("op:%d addr:%x nc: %d idx: %x tag: %x way: %x",
1128 req_op, r0.req.addr, r0.req.nc,
1129 req_index, req_tag, req_hit_way)
1130
1131 with m.If(r0_valid):
1132 sync += r1.mmu_req.eq(r0.mmu_req)
1133
1134 # Fast path for load/store hits.
1135 # Set signals for the writeback controls.
1136 sync += r1.hit_way.eq(req_hit_way)
1137 sync += r1.hit_index.eq(req_index)
1138
1139 with m.If(req_op == Op.OP_LOAD_HIT):
1140 sync += r1.hit_load_valid.eq(1)
1141 with m.Else():
1142 sync += r1.hit_load_valid.eq(0)
1143
1144 with m.If((req_op == Op.OP_LOAD_HIT) | (req_op == Op.OP_STORE_HIT)):
1145 sync += r1.cache_hit.eq(1)
1146 with m.Else():
1147 sync += r1.cache_hit.eq(0)
1148
1149 with m.If(req_op == Op.OP_BAD):
1150 # Display(f"Signalling ld/st error valid_ra={valid_ra}"
1151 # f"rc_ok={rc_ok} perm_ok={perm_ok}"
1152 sync += r1.ls_error.eq(~r0.mmu_req)
1153 sync += r1.mmu_error.eq(r0.mmu_req)
1154 sync += r1.cache_paradox.eq(access_ok)
1155
1156 with m.Else():
1157 sync += r1.ls_error.eq(0)
1158 sync += r1.mmu_error.eq(0)
1159 sync += r1.cache_paradox.eq(0)
1160
1161 with m.If(req_op == Op.OP_STCX_FAIL):
1162 sync += r1.stcx_fail.eq(1)
1163 with m.Else():
1164 sync += r1.stcx_fail.eq(0)
1165
1166 # Record TLB hit information for updating TLB PLRU
1167 sync += r1.tlb_hit.eq(tlb_hit)
1168 sync += r1.tlb_hit_way.eq(tlb_hit_way)
1169 sync += r1.tlb_hit_index.eq(tlb_req_index)
1170
1171 # Memory accesses are handled by this state machine:
1172 #
1173 # * Cache load miss/reload (in conjunction with "rams")
1174 # * Load hits for non-cachable forms
1175 # * Stores (the collision case is handled in "rams")
1176 #
1177 # All wishbone requests generation is done here.
1178 # This machine operates at stage 1.
1179 def dcache_slow(self, m, r1, use_forward1_next, use_forward2_next,
1180 cache_valids, r0, replace_way,
1181 req_hit_way, req_same_tag,
1182 r0_valid, req_op, cache_tags, req_go, ra):
1183
1184 comb = m.d.comb
1185 sync = m.d.sync
1186 wb_in = self.wb_in
1187
1188 req = MemAccessRequest("mreq_ds")
1189 acks = Signal(3)
1190 adjust_acks = Signal(3)
1191
1192 req_row = Signal(ROW_BITS)
1193 req_idx = Signal(INDEX_BITS)
1194 req_tag = Signal(TAG_BITS)
1195 comb += req_idx.eq(get_index(req.real_addr))
1196 comb += req_row.eq(get_row(req.real_addr))
1197 comb += req_tag.eq(get_tag(req.real_addr))
1198
1199 sync += r1.use_forward1.eq(use_forward1_next)
1200 sync += r1.forward_sel.eq(0)
1201
1202 with m.If(use_forward1_next):
1203 sync += r1.forward_sel.eq(r1.req.byte_sel)
1204 with m.Elif(use_forward2_next):
1205 sync += r1.forward_sel.eq(r1.forward_sel1)
1206
1207 sync += r1.forward_data2.eq(r1.forward_data1)
1208 with m.If(r1.write_bram):
1209 sync += r1.forward_data1.eq(r1.req.data)
1210 sync += r1.forward_sel1.eq(r1.req.byte_sel)
1211 sync += r1.forward_way1.eq(r1.req.hit_way)
1212 sync += r1.forward_row1.eq(get_row(r1.req.real_addr))
1213 sync += r1.forward_valid1.eq(1)
1214 with m.Else():
1215 with m.If(r1.dcbz):
1216 sync += r1.forward_data1.eq(0)
1217 with m.Else():
1218 sync += r1.forward_data1.eq(wb_in.dat)
1219 sync += r1.forward_sel1.eq(~0) # all 1s
1220 sync += r1.forward_way1.eq(replace_way)
1221 sync += r1.forward_row1.eq(r1.store_row)
1222 sync += r1.forward_valid1.eq(0)
1223
1224 # Reset the one-cycle pulse signals
1225 sync += r1.slow_valid.eq(0)
1226 sync += r1.write_bram.eq(0)
1227 sync += r1.inc_acks.eq(0)
1228 sync += r1.dec_acks.eq(0)
1229
1230 sync += r1.ls_valid.eq(0)
1231 # complete tlbies and TLB loads in the third cycle
1232 sync += r1.mmu_done.eq(r0_valid & (r0.tlbie | r0.tlbld))
1233
1234 with m.If((req_op == Op.OP_LOAD_HIT)
1235 | (req_op == Op.OP_STCX_FAIL)):
1236 with m.If(~r0.mmu_req):
1237 sync += r1.ls_valid.eq(1)
1238 with m.Else():
1239 sync += r1.mmu_done.eq(1)
1240
1241 with m.If(r1.write_tag):
1242 # Store new tag in selected way
1243 for i in range(NUM_WAYS):
1244 with m.If(i == replace_way):
1245 ct = Signal(TAG_RAM_WIDTH)
1246 comb += ct.eq(cache_tags[r1.store_index])
1247 comb += ct.word_select(i, TAG_WIDTH).eq(r1.reload_tag)
1248 sync += cache_tags[r1.store_index].eq(ct)
1249 sync += r1.store_way.eq(replace_way)
1250 sync += r1.write_tag.eq(0)
1251
1252 # Take request from r1.req if there is one there,
1253 # else from req_op, ra, etc.
1254 with m.If(r1.full):
1255 comb += req.eq(r1.req)
1256 with m.Else():
1257 comb += req.op.eq(req_op)
1258 comb += req.valid.eq(req_go)
1259 comb += req.mmu_req.eq(r0.mmu_req)
1260 comb += req.dcbz.eq(r0.req.dcbz)
1261 comb += req.real_addr.eq(ra)
1262
1263 with m.If(~r0.req.dcbz):
1264 comb += req.data.eq(r0.req.data)
1265 with m.Else():
1266 comb += req.data.eq(0)
1267
1268 # Select all bytes for dcbz
1269 # and for cacheable loads
1270 with m.If(r0.req.dcbz | (r0.req.load & ~r0.req.nc)):
1271 comb += req.byte_sel.eq(~0) # all 1s
1272 with m.Else():
1273 comb += req.byte_sel.eq(r0.req.byte_sel)
1274 comb += req.hit_way.eq(req_hit_way)
1275 comb += req.same_tag.eq(req_same_tag)
1276
1277 # Store the incoming request from r0,
1278 # if it is a slow request
1279 # Note that r1.full = 1 implies req_op = OP_NONE
1280 with m.If((req_op == Op.OP_LOAD_MISS)
1281 | (req_op == Op.OP_LOAD_NC)
1282 | (req_op == Op.OP_STORE_MISS)
1283 | (req_op == Op.OP_STORE_HIT)):
1284 sync += r1.req.eq(req)
1285 sync += r1.full.eq(1)
1286
1287 # Main state machine
1288 with m.Switch(r1.state):
1289
1290 with m.Case(State.IDLE):
1291 sync += r1.wb.adr.eq(req.real_addr)
1292 sync += r1.wb.sel.eq(req.byte_sel)
1293 sync += r1.wb.dat.eq(req.data)
1294 sync += r1.dcbz.eq(req.dcbz)
1295
1296 # Keep track of our index and way
1297 # for subsequent stores.
1298 sync += r1.store_index.eq(req_idx)
1299 sync += r1.store_row.eq(req_row)
1300 sync += r1.end_row_ix.eq(get_row_of_line(req_row))
1301 sync += r1.reload_tag.eq(req_tag)
1302 sync += r1.req.same_tag.eq(1)
1303
1304 with m.If(req.op == Op.OP_STORE_HIT):
1305 sync += r1.store_way.eq(req.hit_way)
1306
1307 # Reset per-row valid bits,
1308 # ready for handling OP_LOAD_MISS
1309 for i in range(ROW_PER_LINE):
1310 sync += r1.rows_valid[i].eq(0)
1311
1312 with m.If(req_op != Op.OP_NONE):
1313 sync += Display("cache op %d", req.op)
1314
1315 with m.Switch(req.op):
1316 with m.Case(Op.OP_LOAD_HIT):
1317 # stay in IDLE state
1318 pass
1319
1320 with m.Case(Op.OP_LOAD_MISS):
1321 sync += Display("cache miss real addr: %x " \
1322 "idx: %x tag: %x",
1323 req.real_addr, req_row, req_tag)
1324
1325 # Start the wishbone cycle
1326 sync += r1.wb.we.eq(0)
1327 sync += r1.wb.cyc.eq(1)
1328 sync += r1.wb.stb.eq(1)
1329
1330 # Track that we had one request sent
1331 sync += r1.state.eq(State.RELOAD_WAIT_ACK)
1332 sync += r1.write_tag.eq(1)
1333
1334 with m.Case(Op.OP_LOAD_NC):
1335 sync += r1.wb.cyc.eq(1)
1336 sync += r1.wb.stb.eq(1)
1337 sync += r1.wb.we.eq(0)
1338 sync += r1.state.eq(State.NC_LOAD_WAIT_ACK)
1339
1340 with m.Case(Op.OP_STORE_HIT, Op.OP_STORE_MISS):
1341 with m.If(~req.dcbz):
1342 sync += r1.state.eq(State.STORE_WAIT_ACK)
1343 sync += r1.acks_pending.eq(1)
1344 sync += r1.full.eq(0)
1345 sync += r1.slow_valid.eq(1)
1346
1347 with m.If(~req.mmu_req):
1348 sync += r1.ls_valid.eq(1)
1349 with m.Else():
1350 sync += r1.mmu_done.eq(1)
1351
1352 with m.If(req.op == Op.OP_STORE_HIT):
1353 sync += r1.write_bram.eq(1)
1354 with m.Else():
1355 # dcbz is handled much like a load miss except
1356 # that we are writing to memory instead of reading
1357 sync += r1.state.eq(State.RELOAD_WAIT_ACK)
1358
1359 with m.If(req.op == Op.OP_STORE_MISS):
1360 sync += r1.write_tag.eq(1)
1361
1362 sync += r1.wb.we.eq(1)
1363 sync += r1.wb.cyc.eq(1)
1364 sync += r1.wb.stb.eq(1)
1365
1366 # OP_NONE and OP_BAD do nothing
1367 # OP_BAD & OP_STCX_FAIL were
1368 # handled above already
1369 with m.Case(Op.OP_NONE):
1370 pass
1371 with m.Case(Op.OP_BAD):
1372 pass
1373 with m.Case(Op.OP_STCX_FAIL):
1374 pass
1375
1376 with m.Case(State.RELOAD_WAIT_ACK):
1377 ld_stbs_done = Signal()
1378 # Requests are all sent if stb is 0
1379 comb += ld_stbs_done.eq(~r1.wb.stb)
1380
1381 with m.If((~wb_in.stall) & r1.wb.stb):
1382 # That was the last word?
1383 # We are done sending.
1384 # Clear stb and set ld_stbs_done
1385 # so we can handle an eventual
1386 # last ack on the same cycle.
1387 with m.If(is_last_row_addr(r1.wb.adr, r1.end_row_ix)):
1388 sync += r1.wb.stb.eq(0)
1389 comb += ld_stbs_done.eq(1)
1390
1391 # Calculate the next row address in the current cache line
1392 rarange = Signal(LINE_OFF_BITS-ROW_OFF_BITS)
1393 comb += rarange.eq(r1.wb.adr[ROW_OFF_BITS:LINE_OFF_BITS]+1)
1394 sync += r1.wb.adr[ROW_OFF_BITS:LINE_OFF_BITS].eq(rarange)
1395
1396 # Incoming acks processing
1397 sync += r1.forward_valid1.eq(wb_in.ack)
1398 with m.If(wb_in.ack):
1399 sync += r1.rows_valid[r1.store_row[:ROW_LINE_BITS]].eq(1)
1400
1401 # If this is the data we were looking for,
1402 # we can complete the request next cycle.
1403 # Compare the whole address in case the
1404 # request in r1.req is not the one that
1405 # started this refill.
1406 with m.If(r1.full & r1.req.same_tag &
1407 ((r1.dcbz & r1.req.dcbz) |
1408 (~r1.dcbz & (r1.req.op == Op.OP_LOAD_MISS))) &
1409 (r1.store_row == get_row(r1.req.real_addr))):
1410 sync += r1.full.eq(0)
1411 sync += r1.slow_valid.eq(1)
1412 with m.If(~r1.mmu_req):
1413 sync += r1.ls_valid.eq(1)
1414 with m.Else():
1415 sync += r1.mmu_done.eq(1)
1416 sync += r1.forward_sel.eq(~0) # all 1s
1417 sync += r1.use_forward1.eq(1)
1418
1419 # Check for completion
1420 with m.If(ld_stbs_done & is_last_row(r1.store_row,
1421 r1.end_row_ix)):
1422 # Complete wishbone cycle
1423 sync += r1.wb.cyc.eq(0)
1424
1425 # Cache line is now valid
1426 cv = Signal(NUM_WAYS) # one valid bit per way
1427 comb += cv.eq(cache_valids[r1.store_index])
1428 comb += cv.bit_select(r1.store_way, 1).eq(1)
1429 sync += cache_valids[r1.store_index].eq(cv)
1430 sync += r1.state.eq(State.IDLE)
1431
1432 # Increment store row counter
1433 sync += r1.store_row.eq(next_row(r1.store_row))
1434
1435 with m.Case(State.STORE_WAIT_ACK):
1436 st_stbs_done = Signal()
1437 comb += st_stbs_done.eq(~r1.wb.stb)
1438 comb += acks.eq(r1.acks_pending)
1439
1440 with m.If(r1.inc_acks != r1.dec_acks):
1441 with m.If(r1.inc_acks):
1442 comb += adjust_acks.eq(acks + 1)
1443 with m.Else():
1444 comb += adjust_acks.eq(acks - 1)
1445 with m.Else():
1446 comb += adjust_acks.eq(acks)
1447
1448 sync += r1.acks_pending.eq(adjust_acks)
1449
1450 # Clear stb when slave accepted request
1451 with m.If(~wb_in.stall):
1452 # See if there is another store waiting
1453 # to be done which is in the same real page.
1454 with m.If(req.valid):
1455 ra = req.real_addr[0:SET_SIZE_BITS]
1456 sync += r1.wb.adr[0:SET_SIZE_BITS].eq(ra)
1457 sync += r1.wb.dat.eq(req.data)
1458 sync += r1.wb.sel.eq(req.byte_sel)
1459
1460 with m.Elif((adjust_acks < 7) & req.same_tag &
1461 ((req.op == Op.OP_STORE_MISS)
1462 | (req.op == Op.OP_STORE_HIT))):
1463 sync += r1.wb.stb.eq(1)
1464 comb += st_stbs_done.eq(0)
1465
1466 with m.If(req.op == Op.OP_STORE_HIT):
1467 sync += r1.write_bram.eq(1)
1468 sync += r1.full.eq(0)
1469 sync += r1.slow_valid.eq(1)
1470
1471 # Store requests never come from the MMU
1472 sync += r1.ls_valid.eq(1)
1473 comb += st_stbs_done.eq(0)
1474 sync += r1.inc_acks.eq(1)
1475 with m.Else():
1476 sync += r1.wb.stb.eq(0)
1477 comb += st_stbs_done.eq(1)
1478
1479 # Got ack ? See if complete.
1480 with m.If(wb_in.ack):
1481 with m.If(st_stbs_done & (adjust_acks == 1)):
1482 sync += r1.state.eq(State.IDLE)
1483 sync += r1.wb.cyc.eq(0)
1484 sync += r1.wb.stb.eq(0)
1485 sync += r1.dec_acks.eq(1)
1486
1487 with m.Case(State.NC_LOAD_WAIT_ACK):
1488 # Clear stb when slave accepted request
1489 with m.If(~wb_in.stall):
1490 sync += r1.wb.stb.eq(0)
1491
1492 # Got ack ? complete.
1493 with m.If(wb_in.ack):
1494 sync += r1.state.eq(State.IDLE)
1495 sync += r1.full.eq(0)
1496 sync += r1.slow_valid.eq(1)
1497
1498 with m.If(~r1.mmu_req):
1499 sync += r1.ls_valid.eq(1)
1500 with m.Else():
1501 sync += r1.mmu_done.eq(1)
1502
1503 sync += r1.forward_sel.eq(~0) # all 1s
1504 sync += r1.use_forward1.eq(1)
1505 sync += r1.wb.cyc.eq(0)
1506 sync += r1.wb.stb.eq(0)
1507
1508 def dcache_log(self, m, r1, valid_ra, tlb_hit_way, stall_out, req_op):
1509
1510 sync = m.d.sync
1511 d_out, wb_in, log_out = self.d_out, self.wb_in, self.log_out
1512
1513 sync += log_out.eq(Cat(r1.state[:3], valid_ra, tlb_hit_way[:3],
1514 stall_out, req_op[:3], d_out.valid, d_out.error,
1515 r1.wb.cyc, r1.wb.stb, wb_in.ack, wb_in.stall,
1516 r1.wb.adr[3:6]))
1517
1518 def elaborate(self, platform):
1519
1520 m = Module()
1521 comb = m.d.comb
1522
1523 # Storage. Hopefully "cache_rows" is a BRAM, the rest is LUTs
1524 cache_tags = CacheTagArray()
1525 cache_tag_set = Signal(TAG_RAM_WIDTH)
1526 cache_valids = CacheValidBitsArray()
1527
1528 # TODO attribute ram_style : string;
1529 # TODO attribute ram_style of cache_tags : signal is "distributed";
1530
1531 """note: these are passed to nmigen.hdl.Memory as "attributes".
1532 don't know how, just that they are.
1533 """
1534 dtlb_valid_bits = TLBValidBitsArray()
1535 dtlb_tags = TLBTagsArray()
1536 dtlb_ptes = TLBPtesArray()
1537 # TODO attribute ram_style of
1538 # dtlb_tags : signal is "distributed";
1539 # TODO attribute ram_style of
1540 # dtlb_ptes : signal is "distributed";
1541
1542 r0 = RegStage0("r0")
1543 r0_full = Signal()
1544
1545 r1 = RegStage1("r1")
1546
1547 reservation = Reservation()
1548
1549 # Async signals on incoming request
1550 req_index = Signal(INDEX_BITS)
1551 req_row = Signal(ROW_BITS)
1552 req_hit_way = Signal(WAY_BITS)
1553 req_tag = Signal(TAG_BITS)
1554 req_op = Signal(Op)
1555 req_data = Signal(64)
1556 req_same_tag = Signal()
1557 req_go = Signal()
1558
1559 early_req_row = Signal(ROW_BITS)
1560
1561 cancel_store = Signal()
1562 set_rsrv = Signal()
1563 clear_rsrv = Signal()
1564
1565 r0_valid = Signal()
1566 r0_stall = Signal()
1567
1568 use_forward1_next = Signal()
1569 use_forward2_next = Signal()
1570
1571 cache_out = CacheRamOut()
1572
1573 plru_victim = PLRUOut()
1574 replace_way = Signal(WAY_BITS)
1575
1576 # Wishbone read/write/cache write formatting signals
1577 bus_sel = Signal(8)
1578
1579 # TLB signals
1580 tlb_tag_way = Signal(TLB_TAG_WAY_BITS)
1581 tlb_pte_way = Signal(TLB_PTE_WAY_BITS)
1582 tlb_valid_way = Signal(TLB_NUM_WAYS)
1583 tlb_req_index = Signal(TLB_SET_BITS)
1584 tlb_hit = Signal()
1585 tlb_hit_way = Signal(TLB_WAY_BITS)
1586 pte = Signal(TLB_PTE_BITS)
1587 ra = Signal(REAL_ADDR_BITS)
1588 valid_ra = Signal()
1589 perm_attr = PermAttr("dc_perms")
1590 rc_ok = Signal()
1591 perm_ok = Signal()
1592 access_ok = Signal()
1593
1594 tlb_plru_victim = TLBPLRUOut()
1595
1596 # we don't yet handle collisions between loadstore1 requests
1597 # and MMU requests
1598 comb += self.m_out.stall.eq(0)
1599
1600 # Hold off the request in r0 when r1 has an uncompleted request
1601 comb += r0_stall.eq(r0_full & r1.full)
1602 comb += r0_valid.eq(r0_full & ~r1.full)
1603 comb += self.stall_out.eq(r0_stall)
1604
1605 # Wire up wishbone request latch out of stage 1
1606 comb += self.wb_out.eq(r1.wb)
1607
1608 # call sub-functions putting everything together, using shared
1609 # signals established above
1610 self.stage_0(m, r0, r1, r0_full)
1611 self.tlb_read(m, r0_stall, tlb_valid_way,
1612 tlb_tag_way, tlb_pte_way, dtlb_valid_bits,
1613 dtlb_tags, dtlb_ptes)
1614 self.tlb_search(m, tlb_req_index, r0, r0_valid,
1615 tlb_valid_way, tlb_tag_way, tlb_hit_way,
1616 tlb_pte_way, pte, tlb_hit, valid_ra, perm_attr, ra)
1617 self.tlb_update(m, r0_valid, r0, dtlb_valid_bits, tlb_req_index,
1618 tlb_hit_way, tlb_hit, tlb_plru_victim, tlb_tag_way,
1619 dtlb_tags, tlb_pte_way, dtlb_ptes)
1620 self.maybe_plrus(m, r1, plru_victim)
1621 self.maybe_tlb_plrus(m, r1, tlb_plru_victim)
1622 self.cache_tag_read(m, r0_stall, req_index, cache_tag_set, cache_tags)
1623 self.dcache_request(m, r0, ra, req_index, req_row, req_tag,
1624 r0_valid, r1, cache_valids, replace_way,
1625 use_forward1_next, use_forward2_next,
1626 req_hit_way, plru_victim, rc_ok, perm_attr,
1627 valid_ra, perm_ok, access_ok, req_op, req_go,
1628 tlb_pte_way,
1629 tlb_hit, tlb_hit_way, tlb_valid_way, cache_tag_set,
1630 cancel_store, req_same_tag, r0_stall, early_req_row)
1631 self.reservation_comb(m, cancel_store, set_rsrv, clear_rsrv,
1632 r0_valid, r0, reservation)
1633 self.reservation_reg(m, r0_valid, access_ok, set_rsrv, clear_rsrv,
1634 reservation, r0)
1635 self.writeback_control(m, r1, cache_out)
1636 self.rams(m, r1, early_req_row, cache_out, replace_way)
1637 self.dcache_fast_hit(m, req_op, r0_valid, r0, r1,
1638 req_hit_way, req_index, req_tag, access_ok,
1639 tlb_hit, tlb_hit_way, tlb_req_index)
1640 self.dcache_slow(m, r1, use_forward1_next, use_forward2_next,
1641 cache_valids, r0, replace_way,
1642 req_hit_way, req_same_tag,
1643 r0_valid, req_op, cache_tags, req_go, ra)
1644 #self.dcache_log(m, r1, valid_ra, tlb_hit_way, stall_out, req_op)
1645
1646 return m
1647
1648 def dcache_load(dut, addr, nc=0):
1649 yield dut.d_in.load.eq(1)
1650 yield dut.d_in.nc.eq(nc)
1651 yield dut.d_in.addr.eq(addr)
1652 yield dut.d_in.byte_sel.eq(~0)
1653 yield dut.d_in.valid.eq(1)
1654 yield
1655 yield dut.d_in.valid.eq(0)
1656 yield dut.d_in.byte_sel.eq(0)
1657 yield
1658 while not (yield dut.d_out.valid):
1659 yield
1660 data = yield dut.d_out.data
1661 return data
1662
1663
1664 def dcache_store(dut, addr, data, nc=0):
1665 yield dut.d_in.load.eq(0)
1666 yield dut.d_in.nc.eq(nc)
1667 yield dut.d_in.data.eq(data)
1668 yield dut.d_in.byte_sel.eq(~0)
1669 yield dut.d_in.addr.eq(addr)
1670 yield dut.d_in.valid.eq(1)
1671 yield
1672 yield dut.d_in.valid.eq(0)
1673 yield dut.d_in.byte_sel.eq(0)
1674 yield
1675 while not (yield dut.d_out.valid):
1676 yield
1677
1678
1679 def dcache_random_sim(dut):
1680
1681 # simulated memory, initially all zeros
1682 sim_mem = [0] * 512
1683
1684 # clear stuff
1685 yield dut.d_in.valid.eq(0)
1686 yield dut.d_in.load.eq(0)
1687 yield dut.d_in.priv_mode.eq(1)
1688 yield dut.d_in.nc.eq(0)
1689 yield dut.d_in.addr.eq(0)
1690 yield dut.d_in.data.eq(0)
1691 yield dut.m_in.valid.eq(0)
1692 yield dut.m_in.addr.eq(0)
1693 yield dut.m_in.pte.eq(0)
1694 # wait 4 * clk_period
1695 yield
1696 yield
1697 yield
1698 yield
1699
1700 print ()
1701
1702 for i in range(256):
1703 addr = randint(0, 255)
1704 data = randint(0, (1<<64)-1)
1705 sim_mem[addr] = data
1706 addr *= 8
1707
1708 print ("testing %x data %x" % (addr, data))
1709
1710 yield from dcache_load(dut, addr)
1711 yield from dcache_store(dut, addr, data)
1712
1713 addr = randint(0, 255)
1714 sim_data = sim_mem[addr]
1715 addr *= 8
1716
1717 data = yield from dcache_load(dut, addr)
1718 assert data == sim_data, \
1719 "check %x data %x != %x" % (addr, data, sim_data)
1720
1721 for addr in range(256):
1722 data = yield from dcache_load(dut, addr*8)
1723 assert data == sim_mem[addr], \
1724 "final check %x data %x != %x" % (addr*8, data, sim_mem[addr])
1725
1726 def dcache_sim(dut):
1727 # clear stuff
1728 yield dut.d_in.valid.eq(0)
1729 yield dut.d_in.load.eq(0)
1730 yield dut.d_in.priv_mode.eq(1)
1731 yield dut.d_in.nc.eq(0)
1732 yield dut.d_in.addr.eq(0)
1733 yield dut.d_in.data.eq(0)
1734 yield dut.m_in.valid.eq(0)
1735 yield dut.m_in.addr.eq(0)
1736 yield dut.m_in.pte.eq(0)
1737 # wait 4 * clk_period
1738 yield
1739 yield
1740 yield
1741 yield
1742
1743 # Cacheable read of address 0x58
1744 data = yield from dcache_load(dut, 0x58)
1745 addr = yield dut.d_in.addr
1746 assert data == 0x0000001700000016, \
1747 "data @%x=%x expected 0x0000001700000016" % (addr, data)
1748
1749 # Cacheable read of address 0x20
1750 data = yield from dcache_load(dut, 0x20)
1751 addr = yield dut.d_in.addr
1752 assert data == 0x0000000900000008, \
1753 "data @%x=%x expected 0x0000000900000008" % (addr, data)
1754
1755 # Cacheable read of address 0x530
1756 data = yield from dcache_load(dut, 0x530)
1757 addr = yield dut.d_in.addr
1758 assert data == 0x0000014D0000014C, \
1759 "data @%x=%x expected 0x0000014D0000014C" % (addr, data)
1760
1761 # 2nd Cacheable read of address 0x530
1762 data = yield from dcache_load(dut, 0x530)
1763 addr = yield dut.d_in.addr
1764 assert data == 0x0000014D0000014C, \
1765 "data @%x=%x expected 0x0000014D0000014C" % (addr, data)
1766
1767 # Non-cacheable read of address 0x100
1768 data = yield from dcache_load(dut, 0x100, nc=1)
1769 addr = yield dut.d_in.addr
1770 assert data == 0x0000004100000040, \
1771 "data @%x=%x expected 0x0000004100000040" % (addr, data)
1772
1773 # Store at address 0x530
1774 yield from dcache_store(dut, 0x530, 0x121)
1775
1776 # 2nd store at address 0x530
1777 yield from dcache_store(dut, 0x530, 0x12345678)
1778
1779 # 3rd Cacheable read of address 0x530
1780 data = yield from dcache_load(dut, 0x530)
1781 addr = yield dut.d_in.addr
1782 assert data == 0x12345678, \
1783 "data @%x=%x expected 0x12345678" % (addr, data)
1784
1785 # 2nd Cacheable read of address 0x20
1786 data = yield from dcache_load(dut, 0x20)
1787 addr = yield dut.d_in.addr
1788 assert data == 0x0000000900000008, \
1789 "data @%x=%x expected 0x0000000900000008" % (addr, data)
1790
1791 yield
1792 yield
1793 yield
1794 yield
1795
1796
1797 def test_dcache(mem, test_fn, test_name):
1798 dut = DCache()
1799
1800 memory = Memory(width=64, depth=16*64, init=mem)
1801 sram = SRAM(memory=memory, granularity=8)
1802
1803 m = Module()
1804 m.submodules.dcache = dut
1805 m.submodules.sram = sram
1806
1807 m.d.comb += sram.bus.cyc.eq(dut.wb_out.cyc)
1808 m.d.comb += sram.bus.stb.eq(dut.wb_out.stb)
1809 m.d.comb += sram.bus.we.eq(dut.wb_out.we)
1810 m.d.comb += sram.bus.sel.eq(dut.wb_out.sel)
1811 m.d.comb += sram.bus.adr.eq(dut.wb_out.adr[3:])
1812 m.d.comb += sram.bus.dat_w.eq(dut.wb_out.dat)
1813
1814 m.d.comb += dut.wb_in.ack.eq(sram.bus.ack)
1815 m.d.comb += dut.wb_in.dat.eq(sram.bus.dat_r)
1816
1817 # nmigen Simulation
1818 sim = Simulator(m)
1819 sim.add_clock(1e-6)
1820
1821 sim.add_sync_process(wrap(test_fn(dut)))
1822 with sim.write_vcd('test_dcache%s.vcd' % test_name):
1823 sim.run()
1824
1825 if __name__ == '__main__':
1826 dut = DCache()
1827 vl = rtlil.convert(dut, ports=[])
1828 with open("test_dcache.il", "w") as f:
1829 f.write(vl)
1830
1831 mem = []
1832 for i in range(0,512):
1833 mem.append((i*2)| ((i*2+1)<<32))
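# so 64-bit word i holds 2*i in the low half and 2*i+1 in the high half;
# the expected values in dcache_sim come from this pattern, e.g. the word
# at address 0x58 is index 11 -> 0x0000001700000016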
1834
1835 test_dcache(mem, dcache_sim, "")
1836 test_dcache(None, dcache_random_sim, "random")
1837