1 """DCache
2
3 based on Anton Blanchard microwatt dcache.vhdl
4
5 note that the microwatt dcache wishbone interface expects "stall".
6 for simplicity at the moment this is hard-coded to cyc & ~ack.
7 see WB4 spec, p84, section 5.2.1
8
9 IMPORTANT: for store, the data is sampled the cycle AFTER the "valid"
10 is raised. sigh
11 """
12
13 import sys
14
15 from nmutil.gtkw import write_gtkw
16
17 sys.setrecursionlimit(1000000)
18
19 from enum import Enum, unique
20
21 from nmigen import Module, Signal, Elaboratable, Cat, Repl, Array, Const
22 from nmutil.util import Display
23
24 from copy import deepcopy
25 from random import randint, seed
26
27 from nmigen.cli import main
28 from nmutil.iocontrol import RecordObject
29 from nmigen.utils import log2_int
30 from soc.experiment.mem_types import (LoadStore1ToDCacheType,
31 DCacheToLoadStore1Type,
32 MMUToDCacheType,
33 DCacheToMMUType)
34
35 from soc.experiment.wb_types import (WB_ADDR_BITS, WB_DATA_BITS, WB_SEL_BITS,
36 WBAddrType, WBDataType, WBSelType,
37 WBMasterOut, WBSlaveOut,
38 WBMasterOutVector, WBSlaveOutVector,
39 WBIOMasterOut, WBIOSlaveOut)
40
41 from soc.experiment.cache_ram import CacheRam
42 #from soc.experiment.plru import PLRU
43 from nmutil.plru import PLRU
44
45 # for test
46 from soc.bus.sram import SRAM
47 from nmigen import Memory
48 from nmigen.cli import rtlil
49
50 # NOTE: to use cxxsim, export NMIGEN_SIM_MODE=cxxsim from the shell
51 # Also, check out the cxxsim nmigen branch, and latest yosys from git
52 from nmutil.sim_tmp_alternative import Simulator
53
54 from nmutil.util import wrap
55
56
57 # TODO: make these parameters of DCache at some point
58 LINE_SIZE = 64 # Line size in bytes
59 NUM_LINES = 16 # Number of lines in a set
60 NUM_WAYS = 4 # Number of ways
61 TLB_SET_SIZE = 64 # L1 DTLB number of sets
62 TLB_NUM_WAYS = 2 # L1 DTLB number of ways
63 TLB_LG_PGSZ = 12 # L1 DTLB log_2(page_size)
64 LOG_LENGTH = 0 # Non-zero to enable log data collection
65
66 # BRAM organisation: We never access more than
67 # -- WB_DATA_BITS at a time so to save
68 # -- resources we make the array only that wide, and
69 # -- use consecutive indices to make a cache "line"
70 # --
71 # -- ROW_SIZE is the width in bytes of the BRAM
72 # -- (based on WB, so 64-bits)
73 ROW_SIZE = WB_DATA_BITS // 8;
74
75 # ROW_PER_LINE is the number of row (wishbone
76 # transactions) in a line
77 ROW_PER_LINE = LINE_SIZE // ROW_SIZE
78
79 # BRAM_ROWS is the number of rows in BRAM needed
80 # to represent the full dcache
81 BRAM_ROWS = NUM_LINES * ROW_PER_LINE
82
83 print ("ROW_SIZE", ROW_SIZE)
84 print ("ROW_PER_LINE", ROW_PER_LINE)
85 print ("BRAM_ROWS", BRAM_ROWS)
86 print ("NUM_WAYS", NUM_WAYS)
87
88 # Bit fields counts in the address
89
90 # REAL_ADDR_BITS is the number of real address
91 # bits that we store
92 REAL_ADDR_BITS = 56
93
94 # ROW_BITS is the number of bits to select a row
95 ROW_BITS = log2_int(BRAM_ROWS)
96
97 # ROW_LINE_BITS is the number of bits to select
98 # a row within a line
99 ROW_LINE_BITS = log2_int(ROW_PER_LINE)
100
101 # LINE_OFF_BITS is the number of bits for
102 # the offset in a cache line
103 LINE_OFF_BITS = log2_int(LINE_SIZE)
104
105 # ROW_OFF_BITS is the number of bits for
106 # the offset in a row
107 ROW_OFF_BITS = log2_int(ROW_SIZE)
108
109 # INDEX_BITS is the number of bits to
110 # select a cache line
111 INDEX_BITS = log2_int(NUM_LINES)
112
113 # SET_SIZE_BITS is the log base 2 of the set size
114 SET_SIZE_BITS = LINE_OFF_BITS + INDEX_BITS
115
116 # TAG_BITS is the number of bits of
117 # the tag part of the address
118 TAG_BITS = REAL_ADDR_BITS - SET_SIZE_BITS
119
120 # TAG_WIDTH is the width in bits of each way of the tag RAM
121 TAG_WIDTH = TAG_BITS + 7 - ((TAG_BITS + 7) % 8)
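# e.g. with TAG_BITS = 46 (the default geometry): 46 + 7 = 53,
# 53 % 8 = 5, so TAG_WIDTH = 53 - 5 = 48, i.e. the tag rounded
# up to a whole number of bytes per way.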
122
123 # WAY_BITS is the number of bits to select a way
124 WAY_BITS = log2_int(NUM_WAYS)
125
126 # Example of layout for 32 lines of 64 bytes:
127 layout = """\
128 .. tag |index| line |
129 .. | row | |
130 .. | |---| | ROW_LINE_BITS (3)
131 .. | |--- - --| LINE_OFF_BITS (6)
132 .. | |- --| ROW_OFF_BITS (3)
133 .. |----- ---| | ROW_BITS (8)
134 .. |-----| | INDEX_BITS (5)
135 .. --------| | TAG_BITS (45)
136 """
137 print (layout)
138 print ("Dcache TAG %d IDX %d ROW_BITS %d ROFF %d LOFF %d RLB %d" % \
139 (TAG_BITS, INDEX_BITS, ROW_BITS,
140 ROW_OFF_BITS, LINE_OFF_BITS, ROW_LINE_BITS))
141 print ("index @: %d-%d" % (LINE_OFF_BITS, SET_SIZE_BITS))
142 print ("row @: %d-%d" % (ROW_OFF_BITS, SET_SIZE_BITS))
143 print ("tag @: %d-%d width %d" % (SET_SIZE_BITS, REAL_ADDR_BITS, TAG_WIDTH))
144
145 TAG_RAM_WIDTH = TAG_WIDTH * NUM_WAYS
146
147 print ("TAG_RAM_WIDTH", TAG_RAM_WIDTH)
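
# Worked example for the default constants above (a reading aid,
# derived purely from the arithmetic, not new parameters):
#   ROW_SIZE      = 8 bytes   ROW_PER_LINE  = 8     BRAM_ROWS     = 128
#   ROW_BITS      = 7         ROW_LINE_BITS = 3     ROW_OFF_BITS  = 3
#   LINE_OFF_BITS = 6         INDEX_BITS    = 4     SET_SIZE_BITS = 10
#   TAG_BITS      = 46        TAG_WIDTH     = 48    WAY_BITS      = 2
#   TAG_RAM_WIDTH = 192
# Each way is 1 KiB (16 lines x 64 bytes), so the set index sits
# entirely below the 4 KiB page offset; that is what the
# "SET_SIZE_BITS <= TLB_LG_PGSZ" assert further down relies on.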
148
149 def CacheTagArray():
150 return Array(Signal(TAG_RAM_WIDTH, name="cachetag_%d" % x) \
151 for x in range(NUM_LINES))
152
153 def CacheValidBitsArray():
154 return Array(Signal(NUM_WAYS, name="cachevalid_%d" % x) \
155 for x in range(NUM_LINES))
156
157 def RowPerLineValidArray():
158 return Array(Signal(name="rows_valid%d" % x) \
159 for x in range(ROW_PER_LINE))
160
161 # L1 TLB
162 TLB_SET_BITS = log2_int(TLB_SET_SIZE)
163 TLB_WAY_BITS = log2_int(TLB_NUM_WAYS)
164 TLB_EA_TAG_BITS = 64 - (TLB_LG_PGSZ + TLB_SET_BITS)
165 TLB_TAG_WAY_BITS = TLB_NUM_WAYS * TLB_EA_TAG_BITS
166 TLB_PTE_BITS = 64
167 TLB_PTE_WAY_BITS = TLB_NUM_WAYS * TLB_PTE_BITS;
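
# With the defaults (TLB_SET_SIZE=64, TLB_NUM_WAYS=2, 4 KiB pages)
# these work out to: TLB_SET_BITS = 6, TLB_WAY_BITS = 1,
# TLB_EA_TAG_BITS = 64 - (12 + 6) = 46, TLB_TAG_WAY_BITS = 92 and
# TLB_PTE_WAY_BITS = 128 -- again just the arithmetic spelled out.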
168
169 def ispow2(x):
170 return (1<<log2_int(x, False)) == x
171
172 assert (LINE_SIZE % ROW_SIZE) == 0, "LINE_SIZE not multiple of ROW_SIZE"
173 assert ispow2(LINE_SIZE), "LINE_SIZE not power of 2"
174 assert ispow2(NUM_LINES), "NUM_LINES not power of 2"
175 assert ispow2(ROW_PER_LINE), "ROW_PER_LINE not power of 2"
176 assert ROW_BITS == (INDEX_BITS + ROW_LINE_BITS), "geometry bits don't add up"
177 assert (LINE_OFF_BITS == ROW_OFF_BITS + ROW_LINE_BITS), \
178 "geometry bits don't add up"
179 assert REAL_ADDR_BITS == (TAG_BITS + INDEX_BITS + LINE_OFF_BITS), \
180 "geometry bits don't add up"
181 assert REAL_ADDR_BITS == (TAG_BITS + ROW_BITS + ROW_OFF_BITS), \
182 "geometry bits don't add up"
183 assert 64 == WB_DATA_BITS, "Can't yet handle wb width that isn't 64-bits"
184 assert SET_SIZE_BITS <= TLB_LG_PGSZ, "Set indexed by virtual address"
185
186
187 def TLBValidBitsArray():
188 return Array(Signal(TLB_NUM_WAYS, name="tlbvalid%d" % x) \
189 for x in range(TLB_SET_SIZE))
190
191 def TLBTagEAArray():
192 return Array(Signal(TLB_EA_TAG_BITS, name="tlbtagea%d" % x) \
193 for x in range (TLB_NUM_WAYS))
194
195 def TLBTagsArray():
196 return Array(Signal(TLB_TAG_WAY_BITS, name="tlbtags%d" % x) \
197 for x in range (TLB_SET_SIZE))
198
199 def TLBPtesArray():
200 return Array(Signal(TLB_PTE_WAY_BITS, name="tlbptes%d" % x) \
201 for x in range(TLB_SET_SIZE))
202
203 def HitWaySet():
204 return Array(Signal(WAY_BITS, name="hitway_%d" % x) \
205 for x in range(TLB_NUM_WAYS))
206
207 # Cache RAM interface
208 def CacheRamOut():
209 return Array(Signal(WB_DATA_BITS, name="cache_out%d" % x) \
210 for x in range(NUM_WAYS))
211
212 # PLRU output interface
213 def PLRUOut():
214 return Array(Signal(WAY_BITS, name="plru_out%d" % x) \
215 for x in range(NUM_LINES))
216
217 # TLB PLRU output interface
218 def TLBPLRUOut():
219 return Array(Signal(TLB_WAY_BITS, name="tlbplru_out%d" % x) \
220 for x in range(TLB_SET_SIZE))
221
222 # Helper functions to decode incoming requests
223 #
224 # Return the cache line index (tag index) for an address
225 def get_index(addr):
226 return addr[LINE_OFF_BITS:SET_SIZE_BITS]
227
228 # Return the cache row index (data memory) for an address
229 def get_row(addr):
230 return addr[ROW_OFF_BITS:SET_SIZE_BITS]
231
232 # Return the index of a row within a line
233 def get_row_of_line(row):
234 return row[:ROW_BITS][:ROW_LINE_BITS]
235
236 # Returns whether this is the last row of a line
237 def is_last_row_addr(addr, last):
238 return addr[ROW_OFF_BITS:LINE_OFF_BITS] == last
239
240 # Returns whether this is the last row of a line
241 def is_last_row(row, last):
242 return get_row_of_line(row) == last
243
244 # Return the next row in the current cache line. We use a
245 # dedicated function in order to limit the size of the
246 # generated adder to be only the bits within a cache line
247 # (3 bits with default settings)
248 def next_row(row):
249 row_v = row[0:ROW_LINE_BITS] + 1
250 return Cat(row_v[:ROW_LINE_BITS], row[ROW_LINE_BITS:])
251
252 # Get the tag value from the address
253 def get_tag(addr):
254 return addr[SET_SIZE_BITS:REAL_ADDR_BITS]
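
# Worked example of the address split, for the default geometry and
# address 0x530 (one of the addresses used by dcache_sim below):
#   get_index(0x530)     = bits [6:10]  = 4
#   get_row(0x530)       = bits [3:10]  = 38
#   get_row_of_line(38)  = lower 3 bits = 6
#   get_tag(0x530)       = bits [10:56] = 1
# next_row() only increments the lower ROW_LINE_BITS, so e.g.
# next_row(39) wraps back to 32, staying within the same cache line.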
255
256 # Read a tag from a tag memory row
257 def read_tag(way, tagset):
258 return tagset.word_select(way, TAG_WIDTH)[:TAG_BITS]
259
260 # Read a TLB tag from a TLB tag memory row
261 def read_tlb_tag(way, tags):
262 return tags.word_select(way, TLB_EA_TAG_BITS)
263
264 # Write a TLB tag to a TLB tag memory row
265 def write_tlb_tag(way, tags, tag):
266 return read_tlb_tag(way, tags).eq(tag)
267
268 # Read a PTE from a TLB PTE memory row
269 def read_tlb_pte(way, ptes):
270 return ptes.word_select(way, TLB_PTE_BITS)
271
272 def write_tlb_pte(way, ptes, newpte):
273 return read_tlb_pte(way, ptes).eq(newpte)
274
275
276 # Record for storing permission, attribute, etc. bits from a PTE
277 class PermAttr(RecordObject):
278 def __init__(self, name=None):
279 super().__init__(name=name)
280 self.reference = Signal()
281 self.changed = Signal()
282 self.nocache = Signal()
283 self.priv = Signal()
284 self.rd_perm = Signal()
285 self.wr_perm = Signal()
286
287
288 def extract_perm_attr(pte):
289 pa = PermAttr()
290 return pa;
291
292
293 # Type of operation on a "valid" input
294 @unique
295 class Op(Enum):
296 OP_NONE = 0
297 OP_BAD = 1 # NC cache hit, TLB miss, prot/RC failure
298 OP_STCX_FAIL = 2 # conditional store w/o reservation
299 OP_LOAD_HIT = 3 # Cache hit on load
300 OP_LOAD_MISS = 4 # Load missing cache
301 OP_LOAD_NC = 5 # Non-cachable load
302 OP_STORE_HIT = 6 # Store hitting cache
303 OP_STORE_MISS = 7 # Store missing cache
304
305
306 # Cache state machine
307 @unique
308 class State(Enum):
309 IDLE = 0 # Normal load hit processing
310 RELOAD_WAIT_ACK = 1 # Cache reload wait ack
311 STORE_WAIT_ACK = 2 # Store wait ack
312 NC_LOAD_WAIT_ACK = 3 # Non-cachable load wait ack
313
314
315 # Dcache operations:
316 #
317 # In order to make timing, we use the BRAMs with
318 # an output buffer, which means that the BRAM
319 # output is delayed by an extra cycle.
320 #
321 # Thus, the dcache has a 2-stage internal pipeline
322 # for cache hits with no stalls.
323 #
324 # All other operations are handled via stalling
325 # in the first stage.
326 #
327 # The second stage can thus complete a hit at the same
328 # time as the first stage emits a stall for a complex op.
329 #
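# Informal load-hit timing sketch (a reading of the code below, not a
# cycle-accurate specification):
#   cycle 0: loadstore1 presents d_in; the data BRAM read is already
#            issued at early_req_row, and stage_0 latches the request
#            into r0 at the clock edge
#   cycle 1: TLB and cache tags are looked up for r0; hit detection
#            latches hit_way / hit_load_valid into r1
#   cycle 2: the output-buffered BRAM data arrives and
#            writeback_control drives d_out.valid / d_out.data
#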
330 # Stage 0 register, basically contains just the latched request
331
332 class RegStage0(RecordObject):
333 def __init__(self, name=None):
334 super().__init__(name=name)
335 self.req = LoadStore1ToDCacheType(name="lsmem")
336 self.tlbie = Signal() # indicates a tlbie request (from MMU)
337 self.doall = Signal() # with tlbie, indicates flush whole TLB
338 self.tlbld = Signal() # indicates a TLB load request (from MMU)
339 self.mmu_req = Signal() # indicates source of request
340 self.d_valid = Signal() # indicates req.data is valid now
341
342
343 class MemAccessRequest(RecordObject):
344 def __init__(self, name=None):
345 super().__init__(name=name)
346 self.op = Signal(Op)
347 self.valid = Signal()
348 self.dcbz = Signal()
349 self.real_addr = Signal(REAL_ADDR_BITS)
350 self.data = Signal(64)
351 self.byte_sel = Signal(8)
352 self.hit_way = Signal(WAY_BITS)
353 self.same_tag = Signal()
354 self.mmu_req = Signal()
355
356
357 # First stage register, contains state for stage 1 of load hits
358 # and for the state machine used by all other operations
359 class RegStage1(RecordObject):
360 def __init__(self, name=None):
361 super().__init__(name=name)
362 # Info about the request
363 self.full = Signal() # have uncompleted request
364 self.mmu_req = Signal() # request is from MMU
365 self.req = MemAccessRequest(name="reqmem")
366
367 # Cache hit state
368 self.hit_way = Signal(WAY_BITS)
369 self.hit_load_valid = Signal()
370 self.hit_index = Signal(INDEX_BITS)
371 self.cache_hit = Signal()
372
373 # TLB hit state
374 self.tlb_hit = Signal()
375 self.tlb_hit_way = Signal(TLB_NUM_WAYS)
376 self.tlb_hit_index = Signal(TLB_WAY_BITS)
377
378 # 2-stage data buffer for data forwarded from writes to reads
379 self.forward_data1 = Signal(64)
380 self.forward_data2 = Signal(64)
381 self.forward_sel1 = Signal(8)
382 self.forward_valid1 = Signal()
383 self.forward_way1 = Signal(WAY_BITS)
384 self.forward_row1 = Signal(ROW_BITS)
385 self.use_forward1 = Signal()
386 self.forward_sel = Signal(8)
387
388 # Cache miss state (reload state machine)
389 self.state = Signal(State)
390 self.dcbz = Signal()
391 self.write_bram = Signal()
392 self.write_tag = Signal()
393 self.slow_valid = Signal()
394 self.wb = WBMasterOut("wb")
395 self.reload_tag = Signal(TAG_BITS)
396 self.store_way = Signal(WAY_BITS)
397 self.store_row = Signal(ROW_BITS)
398 self.store_index = Signal(INDEX_BITS)
399 self.end_row_ix = Signal(ROW_LINE_BITS)
400 self.rows_valid = RowPerLineValidArray()
401 self.acks_pending = Signal(3)
402 self.inc_acks = Signal()
403 self.dec_acks = Signal()
404
405 # Signals to complete (possibly with error)
406 self.ls_valid = Signal()
407 self.ls_error = Signal()
408 self.mmu_done = Signal()
409 self.mmu_error = Signal()
410 self.cache_paradox = Signal()
411
412 # Signal to complete a failed stcx.
413 self.stcx_fail = Signal()
414
415
416 # Reservation information
417 class Reservation(RecordObject):
418 def __init__(self):
419 super().__init__()
420 self.valid = Signal()
421 self.addr = Signal(64-LINE_OFF_BITS)
422
423
424 class DTLBUpdate(Elaboratable):
425 def __init__(self):
426 self.tlbie = Signal()
427 self.tlbwe = Signal()
428 self.doall = Signal()
429 self.updated = Signal()
430 self.v_updated = Signal()
431 self.tlb_hit = Signal()
432 self.tlb_req_index = Signal(TLB_SET_BITS)
433
434 self.tlb_hit_way = Signal(TLB_WAY_BITS)
435 self.tlb_tag_way = Signal(TLB_TAG_WAY_BITS)
436 self.tlb_pte_way = Signal(TLB_PTE_WAY_BITS)
437 self.repl_way = Signal(TLB_WAY_BITS)
438 self.eatag = Signal(TLB_EA_TAG_BITS)
439 self.pte_data = Signal(TLB_PTE_BITS)
440
441 self.dv = Signal(TLB_NUM_WAYS) # tlb_way_valids_t
442
443 self.tb_out = Signal(TLB_TAG_WAY_BITS) # tlb_way_tags_t
444 self.pb_out = Signal(TLB_NUM_WAYS) # tlb_way_valids_t
445 self.db_out = Signal(TLB_PTE_WAY_BITS) # tlb_way_ptes_t
446
447 def elaborate(self, platform):
448 m = Module()
449 comb = m.d.comb
450 sync = m.d.sync
451
452 tagset = Signal(TLB_TAG_WAY_BITS)
453 pteset = Signal(TLB_PTE_WAY_BITS)
454
455 tb_out, pb_out, db_out = self.tb_out, self.pb_out, self.db_out
456 comb += db_out.eq(self.dv)
457
458 with m.If(self.tlbie & self.doall):
459 pass # clear all back in parent
460 with m.Elif(self.tlbie):
461 with m.If(self.tlb_hit):
462 comb += db_out.bit_select(self.tlb_hit_way, 1).eq(1)
463 comb += self.v_updated.eq(1)
464
465 with m.Elif(self.tlbwe):
466
467 comb += tagset.eq(self.tlb_tag_way)
468 comb += write_tlb_tag(self.repl_way, tagset, self.eatag)
469 comb += tb_out.eq(tagset)
470
471 comb += pteset.eq(self.tlb_pte_way)
472 comb += write_tlb_pte(self.repl_way, pteset, self.pte_data)
473 comb += pb_out.eq(pteset)
474
475 comb += db_out.bit_select(self.repl_way, 1).eq(1)
476
477 comb += self.updated.eq(1)
478 comb += self.v_updated.eq(1)
479
480 return m
481
482
483 class DCachePendingHit(Elaboratable):
484
485 def __init__(self, tlb_pte_way, tlb_valid_way, tlb_hit_way,
486 cache_valid_idx, cache_tag_set,
487 req_addr,
488 hit_set):
489
490 self.go = Signal()
491 self.virt_mode = Signal()
492 self.is_hit = Signal()
493 self.tlb_hit = Signal()
494 self.hit_way = Signal(WAY_BITS)
495 self.rel_match = Signal()
496 self.req_index = Signal(INDEX_BITS)
497 self.reload_tag = Signal(TAG_BITS)
498
499 self.tlb_hit_way = tlb_hit_way
500 self.tlb_pte_way = tlb_pte_way
501 self.tlb_valid_way = tlb_valid_way
502 self.cache_valid_idx = cache_valid_idx
503 self.cache_tag_set = cache_tag_set
504 self.req_addr = req_addr
505 self.hit_set = hit_set
506
507 def elaborate(self, platform):
508 m = Module()
509 comb = m.d.comb
510 sync = m.d.sync
511
512 go = self.go
513 virt_mode = self.virt_mode
514 is_hit = self.is_hit
515 tlb_pte_way = self.tlb_pte_way
516 tlb_valid_way = self.tlb_valid_way
517 cache_valid_idx = self.cache_valid_idx
518 cache_tag_set = self.cache_tag_set
519 req_addr = self.req_addr
520 tlb_hit_way = self.tlb_hit_way
521 tlb_hit = self.tlb_hit
522 hit_set = self.hit_set
523 hit_way = self.hit_way
524 rel_match = self.rel_match
525 req_index = self.req_index
526 reload_tag = self.reload_tag
527
528 rel_matches = Array(Signal(name="rel_matches_%d" % i) \
529 for i in range(TLB_NUM_WAYS))
530 hit_way_set = HitWaySet()
531
532 # Test if pending request is a hit on any way
533 # In order to make timing in virtual mode,
534 # when we are using the TLB, we compare each
535 # way with each of the real addresses from each way of
536 # the TLB, and then decide later which match to use.
537
538 with m.If(virt_mode):
539 for j in range(TLB_NUM_WAYS): # tlb_num_way_t
540 s_tag = Signal(TAG_BITS, name="s_tag%d" % j)
541 s_hit = Signal()
542 s_pte = Signal(TLB_PTE_BITS)
543 s_ra = Signal(REAL_ADDR_BITS)
544 comb += s_pte.eq(read_tlb_pte(j, tlb_pte_way))
545 comb += s_ra.eq(Cat(req_addr[0:TLB_LG_PGSZ],
546 s_pte[TLB_LG_PGSZ:REAL_ADDR_BITS]))
547 comb += s_tag.eq(get_tag(s_ra))
548
549 for i in range(NUM_WAYS): # way_t
550 is_tag_hit = Signal(name="is_tag_hit_%d_%d" % (j, i))
551 comb += is_tag_hit.eq(go & cache_valid_idx[i] &
552 (read_tag(i, cache_tag_set) == s_tag)
553 & tlb_valid_way[j])
554 with m.If(is_tag_hit):
555 comb += hit_way_set[j].eq(i)
556 comb += s_hit.eq(1)
557 comb += hit_set[j].eq(s_hit)
558 with m.If(s_tag == reload_tag):
559 comb += rel_matches[j].eq(1)
560 with m.If(tlb_hit):
561 comb += is_hit.eq(hit_set[tlb_hit_way])
562 comb += hit_way.eq(hit_way_set[tlb_hit_way])
563 comb += rel_match.eq(rel_matches[tlb_hit_way])
564 with m.Else():
565 s_tag = Signal(TAG_BITS)
566 comb += s_tag.eq(get_tag(req_addr))
567 for i in range(NUM_WAYS): # way_t
568 is_tag_hit = Signal(name="is_tag_hit_%d" % i)
569 comb += is_tag_hit.eq(go & cache_valid_idx[i] &
570 (read_tag(i, cache_tag_set) == s_tag))
571 with m.If(is_tag_hit):
572 comb += hit_way.eq(i)
573 comb += is_hit.eq(1)
574 with m.If(s_tag == reload_tag):
575 comb += rel_match.eq(1)
576
577 return m
578
579
580 class DCache(Elaboratable):
581 """Set associative dcache write-through
582 TODO (in no specific order):
583 * See list in icache.vhdl
584 * Complete load misses on the cycle when WB data comes instead of
585 at the end of line (this requires dealing with requests coming in
586 while not idle...)
587 """
588 def __init__(self):
589 self.d_in = LoadStore1ToDCacheType("d_in")
590 self.d_out = DCacheToLoadStore1Type("d_out")
591
592 self.m_in = MMUToDCacheType("m_in")
593 self.m_out = DCacheToMMUType("m_out")
594
595 self.stall_out = Signal()
596
597 self.wb_out = WBMasterOut("wb_out")
598 self.wb_in = WBSlaveOut("wb_in")
599
600 self.log_out = Signal(20)
601
602 def stage_0(self, m, r0, r1, r0_full):
603 """Latch the request in r0.req as long as we're not stalling
604 """
605 comb = m.d.comb
606 sync = m.d.sync
607 d_in, d_out, m_in = self.d_in, self.d_out, self.m_in
608
609 r = RegStage0("stage0")
610
611 # TODO, this goes in unit tests and formal proofs
612 with m.If(d_in.valid & m_in.valid):
613 sync += Display("request collision loadstore vs MMU")
614
615 with m.If(m_in.valid):
616 comb += r.req.valid.eq(1)
617 comb += r.req.load.eq(~(m_in.tlbie | m_in.tlbld))
618 comb += r.req.dcbz.eq(0)
619 comb += r.req.nc.eq(0)
620 comb += r.req.reserve.eq(0)
621 comb += r.req.virt_mode.eq(0)
622 comb += r.req.priv_mode.eq(1)
623 comb += r.req.addr.eq(m_in.addr)
624 comb += r.req.data.eq(m_in.pte)
625 comb += r.req.byte_sel.eq(~0) # Const -1 sets all to 0b111....
626 comb += r.tlbie.eq(m_in.tlbie)
627 comb += r.doall.eq(m_in.doall)
628 comb += r.tlbld.eq(m_in.tlbld)
629 comb += r.mmu_req.eq(1)
630 with m.Else():
631 comb += r.req.eq(d_in)
632 comb += r.req.data.eq(0)
633 comb += r.tlbie.eq(0)
634 comb += r.doall.eq(0)
635 comb += r.tlbld.eq(0)
636 comb += r.mmu_req.eq(0)
637 with m.If((~r1.full & ~d_in.hold) | ~r0_full):
638 sync += r0.eq(r)
639 sync += r0_full.eq(r.req.valid)
640 # Sample data the cycle after a request comes in from loadstore1.
641 # If another request has come in already then the data will get
642 # put directly into req.data below.
643 with m.If(r0.req.valid & ~r.req.valid & ~r0.d_valid &
644 ~r0.mmu_req):
645 sync += r0.req.data.eq(d_in.data)
646 sync += r0.d_valid.eq(1)
647
648 def tlb_read(self, m, r0_stall, tlb_valid_way,
649 tlb_tag_way, tlb_pte_way, dtlb_valid_bits,
650 dtlb_tags, dtlb_ptes):
651 """TLB
652 Operates in the second cycle on the request latched in r0.req.
653 TLB updates write the entry at the end of the second cycle.
654 """
655 comb = m.d.comb
656 sync = m.d.sync
657 m_in, d_in = self.m_in, self.d_in
658
659 index = Signal(TLB_SET_BITS)
660 addrbits = Signal(TLB_SET_BITS)
661
662 amin = TLB_LG_PGSZ
663 amax = TLB_LG_PGSZ + TLB_SET_BITS
664
665 with m.If(m_in.valid):
666 comb += addrbits.eq(m_in.addr[amin : amax])
667 with m.Else():
668 comb += addrbits.eq(d_in.addr[amin : amax])
669 comb += index.eq(addrbits)
670
671 # If we have any op and the previous op isn't finished,
672 # then keep the same output for next cycle.
673 with m.If(~r0_stall):
674 sync += tlb_valid_way.eq(dtlb_valid_bits[index])
675 sync += tlb_tag_way.eq(dtlb_tags[index])
676 sync += tlb_pte_way.eq(dtlb_ptes[index])
677
678 def maybe_tlb_plrus(self, m, r1, tlb_plru_victim):
679 """Generate TLB PLRUs
680 """
681 comb = m.d.comb
682 sync = m.d.sync
683
684 if TLB_NUM_WAYS == 0:
685 return
686 for i in range(TLB_SET_SIZE):
687 # TLB PLRU interface
688 tlb_plru = PLRU(TLB_WAY_BITS)
689 setattr(m.submodules, "maybe_plru_%d" % i, tlb_plru)
690 tlb_plru_acc_en = Signal()
691
692 comb += tlb_plru_acc_en.eq(r1.tlb_hit & (r1.tlb_hit_index == i))
693 comb += tlb_plru.acc_en.eq(tlb_plru_acc_en)
694 comb += tlb_plru.acc_i.eq(r1.tlb_hit_way)
695 comb += tlb_plru_victim[i].eq(tlb_plru.lru_o)
696
697 def tlb_search(self, m, tlb_req_index, r0, r0_valid,
698 tlb_valid_way, tlb_tag_way, tlb_hit_way,
699 tlb_pte_way, pte, tlb_hit, valid_ra, perm_attr, ra):
700
701 comb = m.d.comb
702
703 hitway = Signal(TLB_WAY_BITS)
704 hit = Signal()
705 eatag = Signal(TLB_EA_TAG_BITS)
706
707 TLB_LG_END = TLB_LG_PGSZ + TLB_SET_BITS
708 comb += tlb_req_index.eq(r0.req.addr[TLB_LG_PGSZ : TLB_LG_END])
709 comb += eatag.eq(r0.req.addr[TLB_LG_END : 64 ])
710
711 for i in range(TLB_NUM_WAYS):
712 is_tag_hit = Signal()
713 comb += is_tag_hit.eq(tlb_valid_way[i]
714 & (read_tlb_tag(i, tlb_tag_way) == eatag))
715 with m.If(is_tag_hit):
716 comb += hitway.eq(i)
717 comb += hit.eq(1)
718
719 comb += tlb_hit.eq(hit & r0_valid)
720 comb += tlb_hit_way.eq(hitway)
721
722 with m.If(tlb_hit):
723 comb += pte.eq(read_tlb_pte(hitway, tlb_pte_way))
724 comb += valid_ra.eq(tlb_hit | ~r0.req.virt_mode)
725
726 with m.If(r0.req.virt_mode):
727 comb += ra.eq(Cat(Const(0, ROW_OFF_BITS),
728 r0.req.addr[ROW_OFF_BITS:TLB_LG_PGSZ],
729 pte[TLB_LG_PGSZ:REAL_ADDR_BITS]))
730 comb += perm_attr.reference.eq(pte[8])
731 comb += perm_attr.changed.eq(pte[7])
732 comb += perm_attr.nocache.eq(pte[5])
733 comb += perm_attr.priv.eq(pte[3])
734 comb += perm_attr.rd_perm.eq(pte[2])
735 comb += perm_attr.wr_perm.eq(pte[1])
736 with m.Else():
737 comb += ra.eq(Cat(Const(0, ROW_OFF_BITS),
738 r0.req.addr[ROW_OFF_BITS:REAL_ADDR_BITS]))
739 comb += perm_attr.reference.eq(1)
740 comb += perm_attr.changed.eq(1)
741 comb += perm_attr.nocache.eq(0)
742 comb += perm_attr.priv.eq(1)
743 comb += perm_attr.rd_perm.eq(1)
744 comb += perm_attr.wr_perm.eq(1)
745
746 def tlb_update(self, m, r0_valid, r0, dtlb_valid_bits, tlb_req_index,
747 tlb_hit_way, tlb_hit, tlb_plru_victim, tlb_tag_way,
748 dtlb_tags, tlb_pte_way, dtlb_ptes):
749
750 dtlb_valids = TLBValidBitsArray()
751
752 comb = m.d.comb
753 sync = m.d.sync
754
755 tlbie = Signal()
756 tlbwe = Signal()
757
758 comb += tlbie.eq(r0_valid & r0.tlbie)
759 comb += tlbwe.eq(r0_valid & r0.tlbld)
760
761 m.submodules.tlb_update = d = DTLBUpdate()
762 with m.If(tlbie & r0.doall):
763 # clear all valid bits at once
764 for i in range(TLB_SET_SIZE):
765 sync += dtlb_valid_bits[i].eq(0)
766 with m.If(d.updated):
767 sync += dtlb_tags[tlb_req_index].eq(d.tb_out)
768 sync += dtlb_ptes[tlb_req_index].eq(d.pb_out)
769 with m.If(d.v_updated):
770 sync += dtlb_valid_bits[tlb_req_index].eq(d.db_out)
771
772 comb += d.dv.eq(dtlb_valid_bits[tlb_req_index])
773
774 comb += d.tlbie.eq(tlbie)
775 comb += d.tlbwe.eq(tlbwe)
776 comb += d.doall.eq(r0.doall)
777 comb += d.tlb_hit.eq(tlb_hit)
778 comb += d.tlb_hit_way.eq(tlb_hit_way)
779 comb += d.tlb_tag_way.eq(tlb_tag_way)
780 comb += d.tlb_pte_way.eq(tlb_pte_way)
781 comb += d.tlb_req_index.eq(tlb_req_index)
782
783 with m.If(tlb_hit):
784 comb += d.repl_way.eq(tlb_hit_way)
785 with m.Else():
786 comb += d.repl_way.eq(tlb_plru_victim[tlb_req_index])
787 comb += d.eatag.eq(r0.req.addr[TLB_LG_PGSZ + TLB_SET_BITS:64])
788 comb += d.pte_data.eq(r0.req.data)
789
790 def maybe_plrus(self, m, r1, plru_victim):
791 """Generate PLRUs
792 """
793 comb = m.d.comb
794 sync = m.d.sync
795
796 if NUM_WAYS == 0:
797 return
798
799 for i in range(NUM_LINES):
800 # PLRU interface
801 plru = PLRU(WAY_BITS)
802 setattr(m.submodules, "plru%d" % i, plru)
803 plru_acc_en = Signal()
804
805 comb += plru_acc_en.eq(r1.cache_hit & (r1.hit_index == i))
806 comb += plru.acc_en.eq(plru_acc_en)
807 comb += plru.acc_i.eq(r1.hit_way)
808 comb += plru_victim[i].eq(plru.lru_o)
809
810 def cache_tag_read(self, m, r0_stall, req_index, cache_tag_set, cache_tags):
811 """Cache tag RAM read port
812 """
813 comb = m.d.comb
814 sync = m.d.sync
815 m_in, d_in = self.m_in, self.d_in
816
817 index = Signal(INDEX_BITS)
818
819 with m.If(r0_stall):
820 comb += index.eq(req_index)
821 with m.Elif(m_in.valid):
822 comb += index.eq(get_index(m_in.addr))
823 with m.Else():
824 comb += index.eq(get_index(d_in.addr))
825 sync += cache_tag_set.eq(cache_tags[index])
826
827 def dcache_request(self, m, r0, ra, req_index, req_row, req_tag,
828 r0_valid, r1, cache_valids, replace_way,
829 use_forward1_next, use_forward2_next,
830 req_hit_way, plru_victim, rc_ok, perm_attr,
831 valid_ra, perm_ok, access_ok, req_op, req_go,
832 tlb_pte_way,
833 tlb_hit, tlb_hit_way, tlb_valid_way, cache_tag_set,
834 cancel_store, req_same_tag, r0_stall, early_req_row):
835 """Cache request parsing and hit detection
836 """
837
838 comb = m.d.comb
839 m_in, d_in = self.m_in, self.d_in
840
841 is_hit = Signal()
842 hit_way = Signal(WAY_BITS)
843 op = Signal(Op)
844 opsel = Signal(3)
845 go = Signal()
846 nc = Signal()
847 hit_set = Array(Signal(name="hit_set_%d" % i) \
848 for i in range(TLB_NUM_WAYS))
849 cache_valid_idx = Signal(NUM_WAYS)
850
851 # Extract line, row and tag from request
852 comb += req_index.eq(get_index(r0.req.addr))
853 comb += req_row.eq(get_row(r0.req.addr))
854 comb += req_tag.eq(get_tag(ra))
855
856 if False: # display on comb is a bit... busy.
857 comb += Display("dcache_req addr:%x ra: %x idx: %x tag: %x row: %x",
858 r0.req.addr, ra, req_index, req_tag, req_row)
859
860 comb += go.eq(r0_valid & ~(r0.tlbie | r0.tlbld) & ~r1.ls_error)
861 comb += cache_valid_idx.eq(cache_valids[req_index])
862
863 m.submodules.dcache_pend = dc = DCachePendingHit(tlb_pte_way,
864 tlb_valid_way, tlb_hit_way,
865 cache_valid_idx, cache_tag_set,
866 r0.req.addr,
867 hit_set)
868
869 comb += dc.tlb_hit.eq(tlb_hit)
870 comb += dc.reload_tag.eq(r1.reload_tag)
871 comb += dc.virt_mode.eq(r0.req.virt_mode)
872 comb += dc.go.eq(go)
873 comb += dc.req_index.eq(req_index)
874 comb += is_hit.eq(dc.is_hit)
875 comb += hit_way.eq(dc.hit_way)
876 comb += req_same_tag.eq(dc.rel_match)
877
878 # See if the request matches the line currently being reloaded
879 with m.If((r1.state == State.RELOAD_WAIT_ACK) &
880 (req_index == r1.store_index) & req_same_tag):
881 # For a store, consider this a hit even if the row isn't
882 # valid since it will be by the time we perform the store.
883 # For a load, check the appropriate row valid bit.
884 rrow = Signal(ROW_LINE_BITS)
885 comb += rrow.eq(req_row)
886 valid = r1.rows_valid[rrow]
887 comb += is_hit.eq((~r0.req.load) | valid)
888 comb += hit_way.eq(replace_way)
889
890 # Whether to use forwarded data for a load or not
891 with m.If((get_row(r1.req.real_addr) == req_row) &
892 (r1.req.hit_way == hit_way)):
893 # Only need to consider r1.write_bram here, since if we
894 # are writing refill data here, then we don't have a
895 # cache hit this cycle on the line being refilled.
896 # (There is the possibility that the load following the
897 # load miss that started the refill could be to the old
898 # contents of the victim line, since it is a couple of
899 # cycles after the refill starts before we see the updated
900 # cache tag. In that case we don't use the bypass.)
901 comb += use_forward1_next.eq(r1.write_bram)
902 with m.If((r1.forward_row1 == req_row) & (r1.forward_way1 == hit_way)):
903 comb += use_forward2_next.eq(r1.forward_valid1)
904
905 # The way that matched on a hit
906 comb += req_hit_way.eq(hit_way)
907
908 # The way to replace on a miss
909 with m.If(r1.write_tag):
910 comb += replace_way.eq(plru_victim[r1.store_index])
911 with m.Else():
912 comb += replace_way.eq(r1.store_way)
913
914 # work out whether we have permission for this access
915 # NB we don't yet implement AMR, thus no KUAP
916 comb += rc_ok.eq(perm_attr.reference
917 & (r0.req.load | perm_attr.changed))
918 comb += perm_ok.eq((r0.req.priv_mode | (~perm_attr.priv)) &
919 (perm_attr.wr_perm |
920 (r0.req.load & perm_attr.rd_perm)))
921 comb += access_ok.eq(valid_ra & perm_ok & rc_ok)
922 # Combine the request and cache hit status to decide what
923 # operation needs to be done
924 comb += nc.eq(r0.req.nc | perm_attr.nocache)
925 comb += op.eq(Op.OP_NONE)
926 with m.If(go):
927 with m.If(~access_ok):
928 comb += op.eq(Op.OP_BAD)
929 with m.Elif(cancel_store):
930 comb += op.eq(Op.OP_STCX_FAIL)
931 with m.Else():
932 comb += opsel.eq(Cat(is_hit, nc, r0.req.load))
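# opsel decode table (bit 0 = is_hit, bit 1 = nc, bit 2 = load, per
# the Cat() above) -- a reading aid for the Switch below:
#   0b101 cacheable load,  hit   -> OP_LOAD_HIT
#   0b100 cacheable load,  miss  -> OP_LOAD_MISS
#   0b110 non-cacheable load     -> OP_LOAD_NC
#   0b001 cacheable store, hit   -> OP_STORE_HIT
#   0b000 cacheable store, miss  -> OP_STORE_MISS
#   0b010 non-cacheable store    -> OP_STORE_MISS (straight to wishbone)
#   0b011 / 0b111 NC access that hits the cache -> OP_BAD (paradox)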
933 with m.Switch(opsel):
934 with m.Case(0b101): comb += op.eq(Op.OP_LOAD_HIT)
935 with m.Case(0b100): comb += op.eq(Op.OP_LOAD_MISS)
936 with m.Case(0b110): comb += op.eq(Op.OP_LOAD_NC)
937 with m.Case(0b001): comb += op.eq(Op.OP_STORE_HIT)
938 with m.Case(0b000): comb += op.eq(Op.OP_STORE_MISS)
939 with m.Case(0b010): comb += op.eq(Op.OP_STORE_MISS)
940 with m.Case(0b011): comb += op.eq(Op.OP_BAD)
941 with m.Case(0b111): comb += op.eq(Op.OP_BAD)
942 comb += req_op.eq(op)
943 comb += req_go.eq(go)
944
945 # Version of the row number that is valid one cycle earlier
946 # in the cases where we need to read the cache data BRAM.
947 # If we're stalling then we need to keep reading the last
948 # row requested.
949 with m.If(~r0_stall):
950 with m.If(m_in.valid):
951 comb += early_req_row.eq(get_row(m_in.addr))
952 with m.Else():
953 comb += early_req_row.eq(get_row(d_in.addr))
954 with m.Else():
955 comb += early_req_row.eq(req_row)
956
957 def reservation_comb(self, m, cancel_store, set_rsrv, clear_rsrv,
958 r0_valid, r0, reservation):
959 """Handle load-with-reservation and store-conditional instructions
960 """
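# In Power ISA terms (summarised): a load-and-reserve (larx) sets
# reservation.valid and records the cache-line address; a later
# store-conditional (stcx.) to the same line is allowed to proceed,
# while a stcx. with no valid reservation, or to a different line,
# asserts cancel_store and completes as OP_STCX_FAIL.  Only a single
# line-granular reservation is tracked here.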
961 comb = m.d.comb
962
963 with m.If(r0_valid & r0.req.reserve):
964 # XXX generate alignment interrupt if address
965 # is not aligned XXX or if r0.req.nc = '1'
966 with m.If(r0.req.load):
967 comb += set_rsrv.eq(r0.req.atomic_last) # load with reservation
968 with m.Else():
969 comb += clear_rsrv.eq(r0.req.atomic_last) # store conditional
970 with m.If((~reservation.valid) |
971 (r0.req.addr[LINE_OFF_BITS:64] != reservation.addr)):
972 comb += cancel_store.eq(1)
973
974 def reservation_reg(self, m, r0_valid, access_ok, set_rsrv, clear_rsrv,
975 reservation, r0):
976
977 comb = m.d.comb
978 sync = m.d.sync
979
980 with m.If(r0_valid & access_ok):
981 with m.If(clear_rsrv):
982 sync += reservation.valid.eq(0)
983 with m.Elif(set_rsrv):
984 sync += reservation.valid.eq(1)
985 sync += reservation.addr.eq(r0.req.addr[LINE_OFF_BITS:64])
986
987 def writeback_control(self, m, r1, cache_out_row):
988 """Return data for loads & completion control logic
989 """
990 comb = m.d.comb
991 sync = m.d.sync
992 d_out, m_out = self.d_out, self.m_out
993
994 data_out = Signal(64)
995 data_fwd = Signal(64)
996
997 # Use the bypass if are reading the row that was
998 # written 1 or 2 cycles ago, including for the
999 # slow_valid = 1 case (i.e. completing a load
1000 # miss or a non-cacheable load).
1001 with m.If(r1.use_forward1):
1002 comb += data_fwd.eq(r1.forward_data1)
1003 with m.Else():
1004 comb += data_fwd.eq(r1.forward_data2)
1005
1006 comb += data_out.eq(cache_out_row)
1007
1008 for i in range(8):
1009 with m.If(r1.forward_sel[i]):
1010 dsel = data_fwd.word_select(i, 8)
1011 comb += data_out.word_select(i, 8).eq(dsel)
1012
1013 comb += d_out.valid.eq(r1.ls_valid)
1014 comb += d_out.data.eq(data_out)
1015 comb += d_out.store_done.eq(~r1.stcx_fail)
1016 comb += d_out.error.eq(r1.ls_error)
1017 comb += d_out.cache_paradox.eq(r1.cache_paradox)
1018
1019 # Outputs to MMU
1020 comb += m_out.done.eq(r1.mmu_done)
1021 comb += m_out.err.eq(r1.mmu_error)
1022 comb += m_out.data.eq(data_out)
1023
1024 # We have a valid load or store hit or we just completed
1025 # a slow op such as a load miss, a NC load or a store
1026 #
1027 # Note: the load hit is delayed by one cycle. However it
1028 # can still not collide with r.slow_valid (well unless I
1029 # miscalculated) because slow_valid can only be set on a
1030 # subsequent request and not on its first cycle (the state
1031 # machine must have advanced), which makes slow_valid
1032 # at least 2 cycles from the previous hit_load_valid.
1033
1034 # Sanity: Only one of these must be set in any given cycle
1035
1036 if False: # TODO: need Display to get this to work
1037 assert (r1.slow_valid & r1.stcx_fail) != 1, \
1038 "unexpected slow_valid collision with stcx_fail"
1039
1040 assert ((r1.slow_valid | r1.stcx_fail) | r1.hit_load_valid) != 1, \
1041 "unexpected hit_load_delayed collision with slow_valid"
1042
1043 with m.If(~r1.mmu_req):
1044 # Request came from loadstore1...
1045 # Load hit case is the standard path
1046 with m.If(r1.hit_load_valid):
1047 sync += Display("completing load hit data=%x", data_out)
1048
1049 # error cases complete without stalling
1050 with m.If(r1.ls_error):
1051 sync += Display("completing ld/st with error")
1052
1053 # Slow ops (load miss, NC, stores)
1054 with m.If(r1.slow_valid):
1055 sync += Display("completing store or load miss adr=%x data=%x",
1056 r1.req.real_addr, data_out)
1057
1058 with m.Else():
1059 # Request came from MMU
1060 with m.If(r1.hit_load_valid):
1061 sync += Display("completing load hit to MMU, data=%x",
1062 m_out.data)
1063 # error cases complete without stalling
1064 with m.If(r1.mmu_error):
1065 sync += Display("completing MMU ld with error")
1066
1067 # Slow ops (i.e. load miss)
1068 with m.If(r1.slow_valid):
1069 sync += Display("completing MMU load miss, data=%x",
1070 m_out.data)
1071
1072 def rams(self, m, r1, early_req_row, cache_out_row, replace_way):
1073 """rams
1074 Generate a cache RAM for each way. This handles the normal
1075 reads, writes from reloads and the special store-hit update
1076 path as well.
1077
1078 Note: the BRAMs have an extra read buffer, meaning the output
1079 is pipelined an extra cycle. This differs from the
1080 icache. The writeback logic needs to take that into
1081 account by using 1-cycle delayed signals for load hits.
1082 """
1083 comb = m.d.comb
1084 wb_in = self.wb_in
1085
1086 for i in range(NUM_WAYS):
1087 do_read = Signal(name="do_rd%d" % i)
1088 rd_addr = Signal(ROW_BITS, name="rd_addr_%d" % i)
1089 do_write = Signal(name="do_wr%d" % i)
1090 wr_addr = Signal(ROW_BITS, name="wr_addr_%d" % i)
1091 wr_data = Signal(WB_DATA_BITS, name="din_%d" % i)
1092 wr_sel = Signal(ROW_SIZE)
1093 wr_sel_m = Signal(ROW_SIZE)
1094 _d_out = Signal(WB_DATA_BITS, name="dout_%d" % i) # cache_row_t
1095
1096 way = CacheRam(ROW_BITS, WB_DATA_BITS, ADD_BUF=True)
1097 setattr(m.submodules, "cacheram_%d" % i, way)
1098
1099 comb += way.rd_en.eq(do_read)
1100 comb += way.rd_addr.eq(rd_addr)
1101 comb += _d_out.eq(way.rd_data_o)
1102 comb += way.wr_sel.eq(wr_sel_m)
1103 comb += way.wr_addr.eq(wr_addr)
1104 comb += way.wr_data.eq(wr_data)
1105
1106 # Cache hit reads
1107 comb += do_read.eq(1)
1108 comb += rd_addr.eq(early_req_row)
1109 with m.If(r1.hit_way == i):
1110 comb += cache_out_row.eq(_d_out)
1111
1112 # Write mux:
1113 #
1114 # Defaults to wishbone read responses (cache refill)
1115 #
1116 # For timing, the mux on wr_data/sel/addr is not
1117 # dependent on anything other than the current state.
1118
1119 with m.If(r1.write_bram):
1120 # Write store data to BRAM. This happens one
1121 # cycle after the store is in r0.
1122 comb += wr_data.eq(r1.req.data)
1123 comb += wr_sel.eq(r1.req.byte_sel)
1124 comb += wr_addr.eq(get_row(r1.req.real_addr))
1125
1126 with m.If(i == r1.req.hit_way):
1127 comb += do_write.eq(1)
1128 with m.Else():
1129 # Otherwise, we might be doing a reload or a DCBZ
1130 with m.If(r1.dcbz):
1131 comb += wr_data.eq(0)
1132 with m.Else():
1133 comb += wr_data.eq(wb_in.dat)
1134 comb += wr_addr.eq(r1.store_row)
1135 comb += wr_sel.eq(~0) # all 1s
1136
1137 with m.If((r1.state == State.RELOAD_WAIT_ACK)
1138 & wb_in.ack & (replace_way == i)):
1139 comb += do_write.eq(1)
1140
1141 # Mask write selects with do_write since BRAM
1142 # doesn't have a global write-enable
1143 with m.If(do_write):
1144 comb += wr_sel_m.eq(wr_sel)
1145
1146 # Cache hit synchronous machine for the easy case.
1147 # This handles load hits.
1148 # It also handles error cases (TLB miss, cache paradox)
1149 def dcache_fast_hit(self, m, req_op, r0_valid, r0, r1,
1150 req_hit_way, req_index, req_tag, access_ok,
1151 tlb_hit, tlb_hit_way, tlb_req_index):
1152
1153 comb = m.d.comb
1154 sync = m.d.sync
1155
1156 with m.If(req_op != Op.OP_NONE):
1157 sync += Display("op:%d addr:%x nc: %d idx: %x tag: %x way: %x",
1158 req_op, r0.req.addr, r0.req.nc,
1159 req_index, req_tag, req_hit_way)
1160
1161 with m.If(r0_valid):
1162 sync += r1.mmu_req.eq(r0.mmu_req)
1163
1164 # Fast path for load/store hits.
1165 # Set signals for the writeback controls.
1166 sync += r1.hit_way.eq(req_hit_way)
1167 sync += r1.hit_index.eq(req_index)
1168
1169 with m.If(req_op == Op.OP_LOAD_HIT):
1170 sync += r1.hit_load_valid.eq(1)
1171 with m.Else():
1172 sync += r1.hit_load_valid.eq(0)
1173
1174 with m.If((req_op == Op.OP_LOAD_HIT) | (req_op == Op.OP_STORE_HIT)):
1175 sync += r1.cache_hit.eq(1)
1176 with m.Else():
1177 sync += r1.cache_hit.eq(0)
1178
1179 with m.If(req_op == Op.OP_BAD):
1180 # Display(f"Signalling ld/st error valid_ra={valid_ra}"
1181 # f"rc_ok={rc_ok} perm_ok={perm_ok}"
1182 sync += r1.ls_error.eq(~r0.mmu_req)
1183 sync += r1.mmu_error.eq(r0.mmu_req)
1184 sync += r1.cache_paradox.eq(access_ok)
1185
1186 with m.Else():
1187 sync += r1.ls_error.eq(0)
1188 sync += r1.mmu_error.eq(0)
1189 sync += r1.cache_paradox.eq(0)
1190
1191 with m.If(req_op == Op.OP_STCX_FAIL):
1192 sync += r1.stcx_fail.eq(1)
1193 with m.Else():
1194 sync += r1.stcx_fail.eq(0)
1195
1196 # Record TLB hit information for updating TLB PLRU
1197 sync += r1.tlb_hit.eq(tlb_hit)
1198 sync += r1.tlb_hit_way.eq(tlb_hit_way)
1199 sync += r1.tlb_hit_index.eq(tlb_req_index)
1200
1201 # Memory accesses are handled by this state machine:
1202 #
1203 # * Cache load miss/reload (in conjunction with "rams")
1204 # * Load hits for non-cachable forms
1205 # * Stores (the collision case is handled in "rams")
1206 #
1207 # All wishbone requests generation is done here.
1208 # This machine operates at stage 1.
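# State transition summary (as implemented in the Switch(r1.state)
# below -- a reading aid, not additional behaviour):
#   IDLE             -> RELOAD_WAIT_ACK  on OP_LOAD_MISS, or on a dcbz
#                                        store (handled as a line reload
#                                        that writes zeroes)
#   IDLE             -> NC_LOAD_WAIT_ACK on OP_LOAD_NC
#   IDLE             -> STORE_WAIT_ACK   on non-dcbz OP_STORE_HIT/MISS
#   RELOAD_WAIT_ACK  -> IDLE             when the last row of the line
#                                        has been acked
#   STORE_WAIT_ACK   -> IDLE             when the final ack arrives and
#                                        no further store is queued
#   NC_LOAD_WAIT_ACK -> IDLE             on the single ack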
1209 def dcache_slow(self, m, r1, use_forward1_next, use_forward2_next,
1210 cache_valids, r0, replace_way,
1211 req_hit_way, req_same_tag,
1212 r0_valid, req_op, cache_tags, req_go, ra):
1213
1214 comb = m.d.comb
1215 sync = m.d.sync
1216 wb_in = self.wb_in
1217 d_in = self.d_in
1218
1219 req = MemAccessRequest("mreq_ds")
1220
1221 req_row = Signal(ROW_BITS)
1222 req_idx = Signal(INDEX_BITS)
1223 req_tag = Signal(TAG_BITS)
1224 comb += req_idx.eq(get_index(req.real_addr))
1225 comb += req_row.eq(get_row(req.real_addr))
1226 comb += req_tag.eq(get_tag(req.real_addr))
1227
1228 sync += r1.use_forward1.eq(use_forward1_next)
1229 sync += r1.forward_sel.eq(0)
1230
1231 with m.If(use_forward1_next):
1232 sync += r1.forward_sel.eq(r1.req.byte_sel)
1233 with m.Elif(use_forward2_next):
1234 sync += r1.forward_sel.eq(r1.forward_sel1)
1235
1236 sync += r1.forward_data2.eq(r1.forward_data1)
1237 with m.If(r1.write_bram):
1238 sync += r1.forward_data1.eq(r1.req.data)
1239 sync += r1.forward_sel1.eq(r1.req.byte_sel)
1240 sync += r1.forward_way1.eq(r1.req.hit_way)
1241 sync += r1.forward_row1.eq(get_row(r1.req.real_addr))
1242 sync += r1.forward_valid1.eq(1)
1243 with m.Else():
1244 with m.If(r1.dcbz):
1245 sync += r1.forward_data1.eq(0)
1246 with m.Else():
1247 sync += r1.forward_data1.eq(wb_in.dat)
1248 sync += r1.forward_sel1.eq(~0) # all 1s
1249 sync += r1.forward_way1.eq(replace_way)
1250 sync += r1.forward_row1.eq(r1.store_row)
1251 sync += r1.forward_valid1.eq(0)
1252
1253 # One cycle pulses reset
1254 sync += r1.slow_valid.eq(0)
1255 sync += r1.write_bram.eq(0)
1256 sync += r1.inc_acks.eq(0)
1257 sync += r1.dec_acks.eq(0)
1258
1259 sync += r1.ls_valid.eq(0)
1260 # complete tlbies and TLB loads in the third cycle
1261 sync += r1.mmu_done.eq(r0_valid & (r0.tlbie | r0.tlbld))
1262
1263 with m.If((req_op == Op.OP_LOAD_HIT) | (req_op == Op.OP_STCX_FAIL)):
1264 with m.If(~r0.mmu_req):
1265 sync += r1.ls_valid.eq(1)
1266 with m.Else():
1267 sync += r1.mmu_done.eq(1)
1268
1269 with m.If(r1.write_tag):
1270 # Store new tag in selected way
1271 for i in range(NUM_WAYS):
1272 with m.If(i == replace_way):
1273 ct = Signal(TAG_RAM_WIDTH)
1274 comb += ct.eq(cache_tags[r1.store_index])
1275 """
1276 TODO: check this
1277 cache_tags(r1.store_index)((i + 1) * TAG_WIDTH - 1 downto i * TAG_WIDTH) <=
1278 (TAG_WIDTH - 1 downto TAG_BITS => '0') & r1.reload_tag;
1279 """
1280 comb += ct.word_select(i, TAG_WIDTH).eq(r1.reload_tag)
1281 sync += cache_tags[r1.store_index].eq(ct)
1282 sync += r1.store_way.eq(replace_way)
1283 sync += r1.write_tag.eq(0)
1284
1285 # Take request from r1.req if there is one there,
1286 # else from req_op, ra, etc.
1287 with m.If(r1.full):
1288 comb += req.eq(r1.req)
1289 with m.Else():
1290 comb += req.op.eq(req_op)
1291 comb += req.valid.eq(req_go)
1292 comb += req.mmu_req.eq(r0.mmu_req)
1293 comb += req.dcbz.eq(r0.req.dcbz)
1294 comb += req.real_addr.eq(ra)
1295
1296 with m.If(r0.req.dcbz):
1297 # force data to 0 for dcbz
1298 comb += req.data.eq(0)
1299 with m.Elif(r0.d_valid):
1300 comb += req.data.eq(r0.req.data)
1301 with m.Else():
1302 comb += req.data.eq(d_in.data)
1303
1304 # Select all bytes for dcbz
1305 # and for cacheable loads
1306 with m.If(r0.req.dcbz | (r0.req.load & ~r0.req.nc)):
1307 comb += req.byte_sel.eq(~0) # all 1s
1308 with m.Else():
1309 comb += req.byte_sel.eq(r0.req.byte_sel)
1310 comb += req.hit_way.eq(req_hit_way)
1311 comb += req.same_tag.eq(req_same_tag)
1312
1313 # Store the incoming request from r0,
1314 # if it is a slow request
1315 # Note that r1.full = 1 implies req_op = OP_NONE
1316 with m.If((req_op == Op.OP_LOAD_MISS)
1317 | (req_op == Op.OP_LOAD_NC)
1318 | (req_op == Op.OP_STORE_MISS)
1319 | (req_op == Op.OP_STORE_HIT)):
1320 sync += r1.req.eq(req)
1321 sync += r1.full.eq(1)
1322
1323 # Main state machine
1324 with m.Switch(r1.state):
1325
1326 with m.Case(State.IDLE):
1327 sync += r1.wb.adr.eq(req.real_addr[ROW_LINE_BITS:])
1328 sync += r1.wb.sel.eq(req.byte_sel)
1329 sync += r1.wb.dat.eq(req.data)
1330 sync += r1.dcbz.eq(req.dcbz)
1331
1332 # Keep track of our index and way
1333 # for subsequent stores.
1334 sync += r1.store_index.eq(req_idx)
1335 sync += r1.store_row.eq(req_row)
1336 sync += r1.end_row_ix.eq(get_row_of_line(req_row)-1)
1337 sync += r1.reload_tag.eq(req_tag)
1338 sync += r1.req.same_tag.eq(1)
1339
1340 with m.If(req.op == Op.OP_STORE_HIT):
1341 sync += r1.store_way.eq(req.hit_way)
1342
1343 # Reset per-row valid bits,
1344 # ready for handling OP_LOAD_MISS
1345 for i in range(ROW_PER_LINE):
1346 sync += r1.rows_valid[i].eq(0)
1347
1348 with m.If(req_op != Op.OP_NONE):
1349 sync += Display("cache op %d", req.op)
1350
1351 with m.Switch(req.op):
1352 with m.Case(Op.OP_LOAD_HIT):
1353 # stay in IDLE state
1354 pass
1355
1356 with m.Case(Op.OP_LOAD_MISS):
1357 sync += Display("cache miss real addr: %x " \
1358 "idx: %x tag: %x",
1359 req.real_addr, req_row, req_tag)
1360
1361 # Start the wishbone cycle
1362 sync += r1.wb.we.eq(0)
1363 sync += r1.wb.cyc.eq(1)
1364 sync += r1.wb.stb.eq(1)
1365
1366 # Track that we had one request sent
1367 sync += r1.state.eq(State.RELOAD_WAIT_ACK)
1368 sync += r1.write_tag.eq(1)
1369
1370 with m.Case(Op.OP_LOAD_NC):
1371 sync += r1.wb.cyc.eq(1)
1372 sync += r1.wb.stb.eq(1)
1373 sync += r1.wb.we.eq(0)
1374 sync += r1.state.eq(State.NC_LOAD_WAIT_ACK)
1375
1376 with m.Case(Op.OP_STORE_HIT, Op.OP_STORE_MISS):
1377 with m.If(~req.dcbz):
1378 sync += r1.state.eq(State.STORE_WAIT_ACK)
1379 sync += r1.acks_pending.eq(1)
1380 sync += r1.full.eq(0)
1381 sync += r1.slow_valid.eq(1)
1382
1383 with m.If(~req.mmu_req):
1384 sync += r1.ls_valid.eq(1)
1385 with m.Else():
1386 sync += r1.mmu_done.eq(1)
1387
1388 with m.If(req.op == Op.OP_STORE_HIT):
1389 sync += r1.write_bram.eq(1)
1390 with m.Else():
1391 # dcbz is handled much like a load miss except
1392 # that we are writing to memory instead of reading
1393 sync += r1.state.eq(State.RELOAD_WAIT_ACK)
1394
1395 with m.If(req.op == Op.OP_STORE_MISS):
1396 sync += r1.write_tag.eq(1)
1397
1398 sync += r1.wb.we.eq(1)
1399 sync += r1.wb.cyc.eq(1)
1400 sync += r1.wb.stb.eq(1)
1401
1402 # OP_NONE and OP_BAD do nothing
1403 # OP_BAD & OP_STCX_FAIL were
1404 # handled above already
1405 with m.Case(Op.OP_NONE):
1406 pass
1407 with m.Case(Op.OP_BAD):
1408 pass
1409 with m.Case(Op.OP_STCX_FAIL):
1410 pass
1411
1412 with m.Case(State.RELOAD_WAIT_ACK):
1413 ld_stbs_done = Signal()
1414 # Requests are all sent if stb is 0
1415 comb += ld_stbs_done.eq(~r1.wb.stb)
1416
1417 # If we are still sending requests, was one accepted?
1418 with m.If((~wb_in.stall) & r1.wb.stb):
1419 # That was the last word? We are done sending.
1420 # Clear stb and set ld_stbs_done so we can handle an
1421 # eventual last ack on the same cycle.
1422 # sigh - reconstruct wb adr with 3 extra 0s at front
1423 wb_adr = Cat(Const(0, ROW_OFF_BITS), r1.wb.adr)
1424 with m.If(is_last_row_addr(wb_adr, r1.end_row_ix)):
1425 sync += r1.wb.stb.eq(0)
1426 comb += ld_stbs_done.eq(1)
1427
1428 # Calculate the next row address in the current cache line
1429 row = Signal(LINE_OFF_BITS-ROW_OFF_BITS)
1430 comb += row.eq(r1.wb.adr)
1431 sync += r1.wb.adr[:LINE_OFF_BITS-ROW_OFF_BITS].eq(row+1)
1432
1433 # Incoming acks processing
1434 sync += r1.forward_valid1.eq(wb_in.ack)
1435 with m.If(wb_in.ack):
1436 srow = Signal(ROW_LINE_BITS)
1437 comb += srow.eq(r1.store_row)
1438 sync += r1.rows_valid[srow].eq(1)
1439
1440 # If this is the data we were looking for,
1441 # we can complete the request next cycle.
1442 # Compare the whole address in case the
1443 # request in r1.req is not the one that
1444 # started this refill.
1445 with m.If(req.valid & r1.req.same_tag &
1446 ((r1.dcbz & r1.req.dcbz) |
1447 (~r1.dcbz & (r1.req.op == Op.OP_LOAD_MISS))) &
1448 (r1.store_row == get_row(req.real_addr))):
1449 sync += r1.full.eq(0)
1450 sync += r1.slow_valid.eq(1)
1451 with m.If(~r1.mmu_req):
1452 sync += r1.ls_valid.eq(1)
1453 with m.Else():
1454 sync += r1.mmu_done.eq(1)
1455 sync += r1.forward_sel.eq(~0) # all 1s
1456 sync += r1.use_forward1.eq(1)
1457
1458 # Check for completion
1459 with m.If(ld_stbs_done & is_last_row(r1.store_row,
1460 r1.end_row_ix)):
1461 # Complete wishbone cycle
1462 sync += r1.wb.cyc.eq(0)
1463
1464 # Cache line is now valid
1465 cv = Signal(NUM_WAYS) # one valid bit per way
1466 comb += cv.eq(cache_valids[r1.store_index])
1467 comb += cv.bit_select(r1.store_way, 1).eq(1)
1468 sync += cache_valids[r1.store_index].eq(cv)
1469
1470 sync += r1.state.eq(State.IDLE)
1471
1472 # Increment store row counter
1473 sync += r1.store_row.eq(next_row(r1.store_row))
1474
1475 with m.Case(State.STORE_WAIT_ACK):
1476 st_stbs_done = Signal()
1477 acks = Signal(3)
1478 adjust_acks = Signal(3)
1479
1480 comb += st_stbs_done.eq(~r1.wb.stb)
1481 comb += acks.eq(r1.acks_pending)
1482
1483 with m.If(r1.inc_acks != r1.dec_acks):
1484 with m.If(r1.inc_acks):
1485 comb += adjust_acks.eq(acks + 1)
1486 with m.Else():
1487 comb += adjust_acks.eq(acks - 1)
1488 with m.Else():
1489 comb += adjust_acks.eq(acks)
1490
1491 sync += r1.acks_pending.eq(adjust_acks)
1492
1493 # Clear stb when slave accepted request
1494 with m.If(~wb_in.stall):
1495 # See if there is another store waiting
1496 # to be done which is in the same real page.
1497 with m.If(req.valid):
1498 _ra = req.real_addr[ROW_LINE_BITS:SET_SIZE_BITS]
1499 sync += r1.wb.adr[0:SET_SIZE_BITS].eq(_ra)
1500 sync += r1.wb.dat.eq(req.data)
1501 sync += r1.wb.sel.eq(req.byte_sel)
1502
1503 with m.If((adjust_acks < 7) & req.same_tag &
1504 ((req.op == Op.OP_STORE_MISS)
1505 | (req.op == Op.OP_STORE_HIT))):
1506 sync += r1.wb.stb.eq(1)
1507 comb += st_stbs_done.eq(0)
1508
1509 with m.If(req.op == Op.OP_STORE_HIT):
1510 sync += r1.write_bram.eq(1)
1511 sync += r1.full.eq(0)
1512 sync += r1.slow_valid.eq(1)
1513
1514 # Store requests never come from the MMU
1515 sync += r1.ls_valid.eq(1)
1516 comb += st_stbs_done.eq(0)
1517 sync += r1.inc_acks.eq(1)
1518 with m.Else():
1519 sync += r1.wb.stb.eq(0)
1520 comb += st_stbs_done.eq(1)
1521
1522 # Got ack ? See if complete.
1523 with m.If(wb_in.ack):
1524 with m.If(st_stbs_done & (adjust_acks == 1)):
1525 sync += r1.state.eq(State.IDLE)
1526 sync += r1.wb.cyc.eq(0)
1527 sync += r1.wb.stb.eq(0)
1528 sync += r1.dec_acks.eq(1)
1529
1530 with m.Case(State.NC_LOAD_WAIT_ACK):
1531 # Clear stb when slave accepted request
1532 with m.If(~wb_in.stall):
1533 sync += r1.wb.stb.eq(0)
1534
1535 # Got ack ? complete.
1536 with m.If(wb_in.ack):
1537 sync += r1.state.eq(State.IDLE)
1538 sync += r1.full.eq(0)
1539 sync += r1.slow_valid.eq(1)
1540
1541 with m.If(~r1.mmu_req):
1542 sync += r1.ls_valid.eq(1)
1543 with m.Else():
1544 sync += r1.mmu_done.eq(1)
1545
1546 sync += r1.forward_sel.eq(~0) # all 1s
1547 sync += r1.use_forward1.eq(1)
1548 sync += r1.wb.cyc.eq(0)
1549 sync += r1.wb.stb.eq(0)
1550
1551 def dcache_log(self, m, r1, valid_ra, tlb_hit_way, stall_out):
1552
1553 sync = m.d.sync
1554 d_out, wb_in, log_out = self.d_out, self.wb_in, self.log_out
1555
1556 sync += log_out.eq(Cat(r1.state[:3], valid_ra, tlb_hit_way[:3],
1557 stall_out, req_op[:3], d_out.valid, d_out.error,
1558 r1.wb.cyc, r1.wb.stb, wb_in.ack, wb_in.stall,
1559 r1.real_adr[3:6]))
1560
1561 def elaborate(self, platform):
1562
1563 m = Module()
1564 comb = m.d.comb
1565 d_in = self.d_in
1566
1567 # Storage. Hopefully "cache_rows" is a BRAM, the rest is LUTs
1568 cache_tags = CacheTagArray()
1569 cache_tag_set = Signal(TAG_RAM_WIDTH)
1570 cache_valids = CacheValidBitsArray()
1571
1572 # TODO attribute ram_style : string;
1573 # TODO attribute ram_style of cache_tags : signal is "distributed";
1574
1575 """note: these are passed to nmigen.hdl.Memory as "attributes".
1576 don't know how, just that they are.
1577 """
1578 dtlb_valid_bits = TLBValidBitsArray()
1579 dtlb_tags = TLBTagsArray()
1580 dtlb_ptes = TLBPtesArray()
1581 # TODO attribute ram_style of
1582 # dtlb_tags : signal is "distributed";
1583 # TODO attribute ram_style of
1584 # dtlb_ptes : signal is "distributed";
1585
1586 r0 = RegStage0("r0")
1587 r0_full = Signal()
1588
1589 r1 = RegStage1("r1")
1590
1591 reservation = Reservation()
1592
1593 # Async signals on incoming request
1594 req_index = Signal(INDEX_BITS)
1595 req_row = Signal(ROW_BITS)
1596 req_hit_way = Signal(WAY_BITS)
1597 req_tag = Signal(TAG_BITS)
1598 req_op = Signal(Op)
1599 req_data = Signal(64)
1600 req_same_tag = Signal()
1601 req_go = Signal()
1602
1603 early_req_row = Signal(ROW_BITS)
1604
1605 cancel_store = Signal()
1606 set_rsrv = Signal()
1607 clear_rsrv = Signal()
1608
1609 r0_valid = Signal()
1610 r0_stall = Signal()
1611
1612 use_forward1_next = Signal()
1613 use_forward2_next = Signal()
1614
1615 cache_out_row = Signal(WB_DATA_BITS)
1616
1617 plru_victim = PLRUOut()
1618 replace_way = Signal(WAY_BITS)
1619
1620 # Wishbone read/write/cache write formatting signals
1621 bus_sel = Signal(8)
1622
1623 # TLB signals
1624 tlb_tag_way = Signal(TLB_TAG_WAY_BITS)
1625 tlb_pte_way = Signal(TLB_PTE_WAY_BITS)
1626 tlb_valid_way = Signal(TLB_NUM_WAYS)
1627 tlb_req_index = Signal(TLB_SET_BITS)
1628 tlb_hit = Signal()
1629 tlb_hit_way = Signal(TLB_WAY_BITS)
1630 pte = Signal(TLB_PTE_BITS)
1631 ra = Signal(REAL_ADDR_BITS)
1632 valid_ra = Signal()
1633 perm_attr = PermAttr("dc_perms")
1634 rc_ok = Signal()
1635 perm_ok = Signal()
1636 access_ok = Signal()
1637
1638 tlb_plru_victim = TLBPLRUOut()
1639
1640 # we don't yet handle collisions between loadstore1 requests
1641 # and MMU requests
1642 comb += self.m_out.stall.eq(0)
1643
1644 # Hold off the request in r0 when r1 has an uncompleted request
1645 comb += r0_stall.eq(r0_full & (r1.full | d_in.hold))
1646 comb += r0_valid.eq(r0_full & ~r1.full & ~d_in.hold)
1647 comb += self.stall_out.eq(r0_stall)
1648
1649 # Wire up wishbone request latch out of stage 1
1650 comb += self.wb_out.eq(r1.wb)
1651
1652 # deal with litex not doing wishbone pipeline mode
1653 # XXX in wrong way. FIFOs are needed in the SRAM test
1654 # so that stb/ack match up
1655 comb += self.wb_in.stall.eq(self.wb_out.cyc & ~self.wb_in.ack)
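# i.e. emulate the stall signal of WB pipelined mode from a classic
# slave: stall whenever a cycle is open and the ack has not yet
# arrived, so at most one request is in flight (see the note in the
# module docstring and WB4 spec section 5.2.1).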
1656
1657 # call sub-functions putting everything together, using shared
1658 # signals established above
1659 self.stage_0(m, r0, r1, r0_full)
1660 self.tlb_read(m, r0_stall, tlb_valid_way,
1661 tlb_tag_way, tlb_pte_way, dtlb_valid_bits,
1662 dtlb_tags, dtlb_ptes)
1663 self.tlb_search(m, tlb_req_index, r0, r0_valid,
1664 tlb_valid_way, tlb_tag_way, tlb_hit_way,
1665 tlb_pte_way, pte, tlb_hit, valid_ra, perm_attr, ra)
1666 self.tlb_update(m, r0_valid, r0, dtlb_valid_bits, tlb_req_index,
1667 tlb_hit_way, tlb_hit, tlb_plru_victim, tlb_tag_way,
1668 dtlb_tags, tlb_pte_way, dtlb_ptes)
1669 self.maybe_plrus(m, r1, plru_victim)
1670 self.maybe_tlb_plrus(m, r1, tlb_plru_victim)
1671 self.cache_tag_read(m, r0_stall, req_index, cache_tag_set, cache_tags)
1672 self.dcache_request(m, r0, ra, req_index, req_row, req_tag,
1673 r0_valid, r1, cache_valids, replace_way,
1674 use_forward1_next, use_forward2_next,
1675 req_hit_way, plru_victim, rc_ok, perm_attr,
1676 valid_ra, perm_ok, access_ok, req_op, req_go,
1677 tlb_pte_way,
1678 tlb_hit, tlb_hit_way, tlb_valid_way, cache_tag_set,
1679 cancel_store, req_same_tag, r0_stall, early_req_row)
1680 self.reservation_comb(m, cancel_store, set_rsrv, clear_rsrv,
1681 r0_valid, r0, reservation)
1682 self.reservation_reg(m, r0_valid, access_ok, set_rsrv, clear_rsrv,
1683 reservation, r0)
1684 self.writeback_control(m, r1, cache_out_row)
1685 self.rams(m, r1, early_req_row, cache_out_row, replace_way)
1686 self.dcache_fast_hit(m, req_op, r0_valid, r0, r1,
1687 req_hit_way, req_index, req_tag, access_ok,
1688 tlb_hit, tlb_hit_way, tlb_req_index)
1689 self.dcache_slow(m, r1, use_forward1_next, use_forward2_next,
1690 cache_valids, r0, replace_way,
1691 req_hit_way, req_same_tag,
1692 r0_valid, req_op, cache_tags, req_go, ra)
1693 #self.dcache_log(m, r1, valid_ra, tlb_hit_way, stall_out)
1694
1695 return m
1696
1697 def dcache_load(dut, addr, nc=0):
1698 yield dut.d_in.load.eq(1)
1699 yield dut.d_in.nc.eq(nc)
1700 yield dut.d_in.addr.eq(addr)
1701 yield dut.d_in.byte_sel.eq(~0)
1702 yield dut.d_in.valid.eq(1)
1703 yield
1704 yield dut.d_in.valid.eq(0)
1705 yield dut.d_in.byte_sel.eq(0)
1706 while not (yield dut.d_out.valid):
1707 yield
1708 # yield # data is valid one cycle AFTER valid goes hi? (no it isn't)
1709 data = yield dut.d_out.data
1710 return data
1711
1712
1713 def dcache_store(dut, addr, data, nc=0):
1714 yield dut.d_in.load.eq(0)
1715 yield dut.d_in.nc.eq(nc)
1716 yield dut.d_in.byte_sel.eq(~0)
1717 yield dut.d_in.addr.eq(addr)
1718 yield dut.d_in.valid.eq(1)
1719 yield
1720 yield dut.d_in.data.eq(data) # leave set, but the cycle AFTER
1721 yield dut.d_in.valid.eq(0)
1722 yield dut.d_in.byte_sel.eq(0)
1723 while not (yield dut.d_out.valid):
1724 yield
1725
1726
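# Minimal sketch of using the nc argument to dcache_load/dcache_store to
# exercise the non-cacheable (wishbone-direct) path.  The helper name
# dcache_nc_sim and the address/value chosen are illustrative only and
# are not wired into the test list at the bottom of the file; with the
# SRAM model used by test_dcache the stored value is expected to
# round-trip unchanged.
def dcache_nc_sim(dut, mem, nc=1):
    yield dut.d_in.priv_mode.eq(1)
    yield
    yield
    # non-cacheable store, then read the same doubleword back via an
    # NC load and check the value survived the trip over the wishbone
    yield from dcache_store(dut, 0x40, 0x123456789abcdef0, nc)
    data = yield from dcache_load(dut, 0x40, nc)
    assert data == 0x123456789abcdef0, \
        "nc round-trip failed: got %x" % data

# hypothetical invocation: test_dcache(mem, dcache_nc_sim, "nc")
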
1727 def dcache_random_sim(dut, mem, nc=0):
1728
1729 # start copy of mem
1730 sim_mem = deepcopy(mem)
1731 memsize = len(sim_mem)
1732 print ("mem len", memsize)
1733
1734 # clear stuff
1735 yield dut.d_in.valid.eq(0)
1736 yield dut.d_in.load.eq(0)
1737 yield dut.d_in.priv_mode.eq(1)
1738 yield dut.d_in.nc.eq(0)
1739 yield dut.d_in.addr.eq(0)
1740 yield dut.d_in.data.eq(0)
1741 yield dut.m_in.valid.eq(0)
1742 yield dut.m_in.addr.eq(0)
1743 yield dut.m_in.pte.eq(0)
1744 # wait 4 * clk_period
1745 yield
1746 yield
1747 yield
1748 yield
1749
1750 print ()
1751
1752 #for i in range(1024):
1753 # sim_mem[i] = i
1754
1755 for i in range(1024):
1756 addr = randint(0, memsize-1)
1757 data = randint(0, (1<<64)-1)
1758 sim_mem[addr] = data
1759 row = addr
1760 addr *= 8
1761
1762 print ("random testing %d 0x%x row %d data 0x%x" % (i, addr, row, data))
1763
1764 yield from dcache_load(dut, addr, nc)
1765 yield from dcache_store(dut, addr, data, nc)
1766
1767 addr = randint(0, memsize-1)
1768 sim_data = sim_mem[addr]
1769 row = addr
1770 addr *= 8
1771
1772 print (" load 0x%x row %d expect data 0x%x" % (addr, row, sim_data))
1773 data = yield from dcache_load(dut, addr, nc)
1774 assert data == sim_data, \
1775 "check addr 0x%x row %d data %x != %x" % (addr, row, data, sim_data)
1776
1777 for addr in range(memsize):
1778 data = yield from dcache_load(dut, addr*8, nc)
1779 assert data == sim_mem[addr], \
1780 "final check %x data %x != %x" % (addr*8, data, sim_mem[addr])
1781
1782
1783 def dcache_regression_sim(dut, mem, nc=0):
1784
1785 # start copy of mem
1786 sim_mem = deepcopy(mem)
1787 memsize = len(sim_mem)
1788 print ("mem len", memsize)
1789
1790 # clear stuff
1791 yield dut.d_in.valid.eq(0)
1792 yield dut.d_in.load.eq(0)
1793 yield dut.d_in.priv_mode.eq(1)
1794 yield dut.d_in.nc.eq(0)
1795 yield dut.d_in.addr.eq(0)
1796 yield dut.d_in.data.eq(0)
1797 yield dut.m_in.valid.eq(0)
1798 yield dut.m_in.addr.eq(0)
1799 yield dut.m_in.pte.eq(0)
1800 # wait 4 * clk_period
1801 yield
1802 yield
1803 yield
1804 yield
1805
1806 addr = 0
1807 row = addr
1808 addr *= 8
1809
1810 print ("regression testing 0x%x row %d" % (addr, row))
1811
1812 yield from dcache_load(dut, addr, nc)
1813
1814 addr = 2
1815 sim_data = sim_mem[addr]
1816 row = addr
1817 addr *= 8
1818
1819 print (" load 0x%x row %d expect data 0x%x" % (addr, row, sim_data))
1820 data = yield from dcache_load(dut, addr, nc)
1821 assert data == sim_data, \
1822 "check addr 0x%x row %d data %x != %x" % (addr, row, data, sim_data)
1823
1824
1825
1826 def dcache_sim(dut, mem):
1827 # clear stuff
1828 yield dut.d_in.valid.eq(0)
1829 yield dut.d_in.load.eq(0)
1830 yield dut.d_in.priv_mode.eq(1)
1831 yield dut.d_in.nc.eq(0)
1832 yield dut.d_in.addr.eq(0)
1833 yield dut.d_in.data.eq(0)
1834 yield dut.m_in.valid.eq(0)
1835 yield dut.m_in.addr.eq(0)
1836 yield dut.m_in.pte.eq(0)
1837 # wait 4 * clk_period
1838 yield
1839 yield
1840 yield
1841 yield
1842
1843 # Cacheable read of address 0x58
1844 data = yield from dcache_load(dut, 0x58)
1845 addr = yield dut.d_in.addr
1846 assert data == 0x0000001700000016, \
1847 "data @%x=%x expected 0x0000001700000016" % (addr, data)
1848
1849 # Cacheable read of address 0x20
1850 data = yield from dcache_load(dut, 0x20)
1851 addr = yield dut.d_in.addr
1852 assert data == 0x0000000900000008, \
1853 "data @%x=%x expected 0x0000000900000008" % (addr, data)
1854
1855 # Cacheable read of address 0x530
1856 data = yield from dcache_load(dut, 0x530)
1857 addr = yield dut.d_in.addr
1858 assert data == 0x0000014D0000014C, \
1859 "data @%x=%x expected 0x0000014D0000014C" % (addr, data)
1860
1861 # 2nd Cacheable read of address 0x530
1862 data = yield from dcache_load(dut, 0x530)
1863 addr = yield dut.d_in.addr
1864 assert data == 0x0000014D0000014C, \
1865 "data @%x=%x expected 0x0000014D0000014C" % (addr, data)
1866
1867 # Non-cacheable read of address 0x100
1868 data = yield from dcache_load(dut, 0x100, nc=1)
1869 addr = yield dut.d_in.addr
1870 assert data == 0x0000004100000040, \
1871 "data @%x=%x expected 0x0000004100000040" % (addr, data)
1872
1873 # Store at address 0x530
1874 yield from dcache_store(dut, 0x530, 0x121)
1875
1876 # Store at address 0x530 again, overwriting the previous value
1877 yield from dcache_store(dut, 0x530, 0x12345678)
1878
1879 # 3rd Cacheable read of address 0x530
1880 data = yield from dcache_load(dut, 0x530)
1881 addr = yield dut.d_in.addr
1882 assert data == 0x12345678, \
1883 "data @%x=%x expected 0x12345678" % (addr, data)
1884
1885 # 2nd Cacheable read of address 0x20
1886 data = yield from dcache_load(dut, 0x20)
1887 addr = yield dut.d_in.addr
1888 assert data == 0x0000000900000008, \
1889 "data @%x=%x expected 0x0000000900000008" % (addr, data)
1890
1891 yield
1892 yield
1893 yield
1894 yield
1895
1896
1897 def test_dcache(mem, test_fn, test_name):
1898 dut = DCache()
1899
1900 memory = Memory(width=64, depth=len(mem), init=mem, simulate=True)
1901 sram = SRAM(memory=memory, granularity=8)
1902
1903 m = Module()
1904 m.submodules.dcache = dut
1905 m.submodules.sram = sram
1906
1907 m.d.comb += sram.bus.cyc.eq(dut.wb_out.cyc)
1908 m.d.comb += sram.bus.stb.eq(dut.wb_out.stb)
1909 m.d.comb += sram.bus.we.eq(dut.wb_out.we)
1910 m.d.comb += sram.bus.sel.eq(dut.wb_out.sel)
1911 m.d.comb += sram.bus.adr.eq(dut.wb_out.adr)
1912 m.d.comb += sram.bus.dat_w.eq(dut.wb_out.dat)
1913
1914 m.d.comb += dut.wb_in.ack.eq(sram.bus.ack)
1915 m.d.comb += dut.wb_in.dat.eq(sram.bus.dat_r)
1916
1917 dcache_write_gtkw(test_name)
1918
1919 # nmigen Simulation
1920 sim = Simulator(m)
1921 sim.add_clock(1e-6)
1922
1923 sim.add_sync_process(wrap(test_fn(dut, mem)))
1924 with sim.write_vcd('test_dcache%s.vcd' % test_name):
1925 sim.run()
1926
1927
1928 def dcache_write_gtkw(test_name):
1929 traces = [
1930 'clk',
1931 ('d_in', [
1932 'd_in_load', 'd_in_nc', 'd_in_addr[63:0]', 'd_in_data[63:0]',
1933 'd_in_byte_sel[7:0]', 'd_in_valid'
1934 ]),
1935 ('d_out', [
1936 'd_out_valid', 'd_out_data[63:0]'
1937 ]),
1938 ('wb_out', [
1939 'wb_out_cyc', 'wb_out_stb', 'wb_out_we',
1940 'wb_out_adr[31:0]', 'wb_out_sel[7:0]', 'wb_out_dat[63:0]'
1941 ]),
1942 ('wb_in', [
1943 'wb_in_stall', 'wb_in_ack', 'wb_in_dat[63:0]'
1944 ])
1945 ]
1946 write_gtkw('test_dcache%s.gtkw' % test_name,
1947 'test_dcache%s.vcd' % test_name,
1948 traces, module='top.dcache')
1949
1950
1951 if __name__ == '__main__':
1952 seed(0)
1953 dut = DCache()
1954 vl = rtlil.convert(dut, ports=[])
1955 with open("test_dcache.il", "w") as f:
1956 f.write(vl)
1957
1958 mem = []
1959 memsize = 16
1960 for i in range(memsize):
1961 mem.append(i)
1962
1963 test_dcache(mem, dcache_regression_sim, "simpleregression")
1964
1965 mem = []
1966 memsize = 256
1967 for i in range(memsize):
1968 mem.append(i)
1969
1970 test_dcache(mem, dcache_random_sim, "random")
1971
1972 mem = []
1973 for i in range(1024):
1974 mem.append((i*2)| ((i*2+1)<<32))
1975
1976 test_dcache(mem, dcache_sim, "")
1977