Add GTKWave documents to each DCache unit test
soc.git: src/soc/experiment/dcache.py
1 """DCache
2
3 based on Anton Blanchard microwatt dcache.vhdl
4
5 note that the microwatt dcache wishbone interface expects "stall".
6 for simplicity at the moment this is hard-coded to cyc & ~ack.
7 see WB4 spec, p84, section 5.2.1
8 """
9
10 import sys
11
12 from nmutil.gtkw import write_gtkw
13
14 sys.setrecursionlimit(1000000)
15
16 from enum import Enum, unique
17
18 from nmigen import Module, Signal, Elaboratable, Cat, Repl, Array, Const
19 from nmutil.util import Display
20
21 from copy import deepcopy
22 from random import randint, seed
23
24 from nmigen.cli import main
25 from nmutil.iocontrol import RecordObject
26 from nmigen.utils import log2_int
27 from soc.experiment.mem_types import (LoadStore1ToDCacheType,
28 DCacheToLoadStore1Type,
29 MMUToDCacheType,
30 DCacheToMMUType)
31
32 from soc.experiment.wb_types import (WB_ADDR_BITS, WB_DATA_BITS, WB_SEL_BITS,
33 WBAddrType, WBDataType, WBSelType,
34 WBMasterOut, WBSlaveOut,
35 WBMasterOutVector, WBSlaveOutVector,
36 WBIOMasterOut, WBIOSlaveOut)
37
38 from soc.experiment.cache_ram import CacheRam
39 #from soc.experiment.plru import PLRU
40 from nmutil.plru import PLRU
41
42 # for test
43 from soc.bus.sram import SRAM
44 from nmigen import Memory
45 from nmigen.cli import rtlil
46
47 # NOTE: to use cxxsim, export NMIGEN_SIM_MODE=cxxsim from the shell
48 # Also, check out the cxxsim nmigen branch, and latest yosys from git
49 from nmutil.sim_tmp_alternative import Simulator
50
51 from nmutil.util import wrap
52
53
54 # TODO: make these parameters of DCache at some point
55 LINE_SIZE = 64 # Line size in bytes
56 NUM_LINES = 16 # Number of lines in a set
57 NUM_WAYS = 4 # Number of ways
58 TLB_SET_SIZE = 64 # L1 DTLB entries per set
59 TLB_NUM_WAYS = 2 # L1 DTLB number of ways (associativity)
60 TLB_LG_PGSZ = 12 # L1 DTLB log_2(page_size)
61 LOG_LENGTH = 0 # Non-zero to enable log data collection
62
63 # BRAM organisation: We never access more than
64 # -- WB_DATA_BITS at a time so to save
65 # -- resources we make the array only that wide, and
66 # -- use consecutive indices to make a cache "line"
67 # --
68 # -- ROW_SIZE is the width in bytes of the BRAM
69 # -- (based on WB, so 64-bits)
70 ROW_SIZE = WB_DATA_BITS // 8
71
72 # ROW_PER_LINE is the number of rows (wishbone
73 # transactions) in a line
74 ROW_PER_LINE = LINE_SIZE // ROW_SIZE
75
76 # BRAM_ROWS is the number of rows in BRAM needed
77 # to represent the full dcache
78 BRAM_ROWS = NUM_LINES * ROW_PER_LINE
79
80 print ("ROW_SIZE", ROW_SIZE)
81 print ("ROW_PER_LINE", ROW_PER_LINE)
82 print ("BRAM_ROWS", BRAM_ROWS)
83 print ("NUM_WAYS", NUM_WAYS)
84
85 # Bit fields counts in the address
86
87 # REAL_ADDR_BITS is the number of real address
88 # bits that we store
89 REAL_ADDR_BITS = 56
90
91 # ROW_BITS is the number of bits to select a row
92 ROW_BITS = log2_int(BRAM_ROWS)
93
94 # ROW_LINE_BITS is the number of bits to select
95 # a row within a line
96 ROW_LINE_BITS = log2_int(ROW_PER_LINE)
97
98 # LINE_OFF_BITS is the number of bits for
99 # the offset in a cache line
100 LINE_OFF_BITS = log2_int(LINE_SIZE)
101
102 # ROW_OFF_BITS is the number of bits for
103 # the offset in a row
104 ROW_OFF_BITS = log2_int(ROW_SIZE)
105
106 # INDEX_BITS is the number of bits to
107 # select a cache line
108 INDEX_BITS = log2_int(NUM_LINES)
109
110 # SET_SIZE_BITS is the log base 2 of the set size
111 SET_SIZE_BITS = LINE_OFF_BITS + INDEX_BITS
112
113 # TAG_BITS is the number of bits of
114 # the tag part of the address
115 TAG_BITS = REAL_ADDR_BITS - SET_SIZE_BITS
116
117 # TAG_WIDTH is the width in bits of each way of the tag RAM
118 TAG_WIDTH = TAG_BITS + 7 - ((TAG_BITS + 7) % 8)
119
120 # WAY_BITS is the number of bits to select a way
121 WAY_BITS = log2_int(NUM_WAYS)
122
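# Worked example: with the default parameters above (LINE_SIZE=64,
# NUM_LINES=16, NUM_WAYS=4, WB_DATA_BITS=64, REAL_ADDR_BITS=56) the
# derived geometry comes out as:
#   ROW_SIZE=8  ROW_PER_LINE=8  BRAM_ROWS=128
#   ROW_OFF_BITS=3  ROW_LINE_BITS=3  ROW_BITS=7
#   LINE_OFF_BITS=6  INDEX_BITS=4  SET_SIZE_BITS=10
#   TAG_BITS=46  TAG_WIDTH=48  WAY_BITS=2
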
123 # Example of layout for 32 lines of 64 bytes:
124 layout = """\
125 .. tag |index| line |
126 .. | row | |
127 .. | |---| | ROW_LINE_BITS (3)
128 .. | |--- - --| LINE_OFF_BITS (6)
129 .. | |- --| ROW_OFF_BITS (3)
130 .. |----- ---| | ROW_BITS (8)
131 .. |-----| | INDEX_BITS (5)
132 .. --------| | TAG_BITS (45)
133 """
134 print (layout)
135 print ("Dcache TAG %d IDX %d ROW_BITS %d ROFF %d LOFF %d RLB %d" % \
136 (TAG_BITS, INDEX_BITS, ROW_BITS,
137 ROW_OFF_BITS, LINE_OFF_BITS, ROW_LINE_BITS))
138 print ("index @: %d-%d" % (LINE_OFF_BITS, SET_SIZE_BITS))
139 print ("row @: %d-%d" % (ROW_OFF_BITS, SET_SIZE_BITS))
140 print ("tag @: %d-%d width %d" % (SET_SIZE_BITS, REAL_ADDR_BITS, TAG_WIDTH))
141
142 TAG_RAM_WIDTH = TAG_WIDTH * NUM_WAYS
143
144 print ("TAG_RAM_WIDTH", TAG_RAM_WIDTH)
145
146 def CacheTagArray():
147 return Array(Signal(TAG_RAM_WIDTH, name="cachetag_%d" % x) \
148 for x in range(NUM_LINES))
149
150 def CacheValidBitsArray():
151 return Array(Signal(NUM_WAYS, name="cachevalid_%d" % x) \
152 for x in range(NUM_LINES))
153
154 def RowPerLineValidArray():
155 return Array(Signal(name="rows_valid%d" % x) \
156 for x in range(ROW_PER_LINE))
157
158 # L1 TLB
159 TLB_SET_BITS = log2_int(TLB_SET_SIZE)
160 TLB_WAY_BITS = log2_int(TLB_NUM_WAYS)
161 TLB_EA_TAG_BITS = 64 - (TLB_LG_PGSZ + TLB_SET_BITS)
162 TLB_TAG_WAY_BITS = TLB_NUM_WAYS * TLB_EA_TAG_BITS
163 TLB_PTE_BITS = 64
164 TLB_PTE_WAY_BITS = TLB_NUM_WAYS * TLB_PTE_BITS
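
# With the defaults above (TLB_SET_SIZE=64, TLB_NUM_WAYS=2, TLB_LG_PGSZ=12)
# these come out as: TLB_SET_BITS=6, TLB_WAY_BITS=1, TLB_EA_TAG_BITS=46,
# TLB_TAG_WAY_BITS=92, TLB_PTE_WAY_BITS=128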
165
166 def ispow2(x):
167 return (1<<log2_int(x, False)) == x
168
169 assert (LINE_SIZE % ROW_SIZE) == 0, "LINE_SIZE not multiple of ROW_SIZE"
170 assert ispow2(LINE_SIZE), "LINE_SIZE not power of 2"
171 assert ispow2(NUM_LINES), "NUM_LINES not power of 2"
172 assert ispow2(ROW_PER_LINE), "ROW_PER_LINE not power of 2"
173 assert ROW_BITS == (INDEX_BITS + ROW_LINE_BITS), "geometry bits don't add up"
174 assert (LINE_OFF_BITS == ROW_OFF_BITS + ROW_LINE_BITS), \
175 "geometry bits don't add up"
176 assert REAL_ADDR_BITS == (TAG_BITS + INDEX_BITS + LINE_OFF_BITS), \
177 "geometry bits don't add up"
178 assert REAL_ADDR_BITS == (TAG_BITS + ROW_BITS + ROW_OFF_BITS), \
179 "geometry bits don't add up"
180 assert 64 == WB_DATA_BITS, "Can't yet handle wb width that isn't 64-bits"
181 assert SET_SIZE_BITS <= TLB_LG_PGSZ, "Set indexed by virtual address"
182
183
184 def TLBValidBitsArray():
185 return Array(Signal(TLB_NUM_WAYS, name="tlbvalid%d" % x) \
186 for x in range(TLB_SET_SIZE))
187
188 def TLBTagEAArray():
189 return Array(Signal(TLB_EA_TAG_BITS, name="tlbtagea%d" % x) \
190 for x in range (TLB_NUM_WAYS))
191
192 def TLBTagsArray():
193 return Array(Signal(TLB_TAG_WAY_BITS, name="tlbtags%d" % x) \
194 for x in range (TLB_SET_SIZE))
195
196 def TLBPtesArray():
197 return Array(Signal(TLB_PTE_WAY_BITS, name="tlbptes%d" % x) \
198 for x in range(TLB_SET_SIZE))
199
200 def HitWaySet():
201 return Array(Signal(WAY_BITS, name="hitway_%d" % x) \
202 for x in range(TLB_NUM_WAYS))
203
204 # Cache RAM interface
205 def CacheRamOut():
206 return Array(Signal(WB_DATA_BITS, name="cache_out%d" % x) \
207 for x in range(NUM_WAYS))
208
209 # PLRU output interface
210 def PLRUOut():
211 return Array(Signal(WAY_BITS, name="plru_out%d" % x) \
212 for x in range(NUM_LINES))
213
214 # TLB PLRU output interface
215 def TLBPLRUOut():
216 return Array(Signal(TLB_WAY_BITS, name="tlbplru_out%d" % x) \
217 for x in range(TLB_SET_SIZE))
218
219 # Helper functions to decode incoming requests
220 #
221 # Return the cache line index (tag index) for an address
222 def get_index(addr):
223 return addr[LINE_OFF_BITS:SET_SIZE_BITS]
224
225 # Return the cache row index (data memory) for an address
226 def get_row(addr):
227 return addr[ROW_OFF_BITS:SET_SIZE_BITS]
228
229 # Return the index of a row within a line
230 def get_row_of_line(row):
231 return row[:ROW_BITS][:ROW_LINE_BITS]
232
233 # Returns whether the given address is in the last row of a line
234 def is_last_row_addr(addr, last):
235 return addr[ROW_OFF_BITS:LINE_OFF_BITS] == last
236
237 # Returns whether this is the last row of a line
238 def is_last_row(row, last):
239 return get_row_of_line(row) == last
240
241 # Return the next row in the current cache line. We use a
242 # dedicated function in order to limit the size of the
243 # generated adder to be only the bits within a cache line
244 # (3 bits with default settings)
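# e.g. with ROW_LINE_BITS=3, next_row(0b0101_111) == 0b0101_000: the
# increment wraps within the line and the upper (index) bits are untouched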
245 def next_row(row):
246 row_v = row[0:ROW_LINE_BITS] + 1
247 return Cat(row_v[:ROW_LINE_BITS], row[ROW_LINE_BITS:])
248
249 # Get the tag value from the address
250 def get_tag(addr):
251 return addr[SET_SIZE_BITS:REAL_ADDR_BITS]
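
# Worked example with the default geometry, for real address 0x530:
#   get_index(0x530)    == 4    (addr[6:10],  selects the cache line)
#   get_row(0x530)      == 38   (addr[3:10],  selects the BRAM row)
#   get_row_of_line(38) == 6    (row 38 is the 7th of 8 rows in its line)
#   get_tag(0x530)      == 1    (addr[10:56])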
252
253 # Read a tag from a tag memory row
254 def read_tag(way, tagset):
255 return tagset.word_select(way, TAG_WIDTH)[:TAG_BITS]
256
257 # Read a TLB tag from a TLB tag memory row
258 def read_tlb_tag(way, tags):
259 return tags.word_select(way, TLB_EA_TAG_BITS)
260
261 # Write a TLB tag to a TLB tag memory row
262 def write_tlb_tag(way, tags, tag):
263 return read_tlb_tag(way, tags).eq(tag)
264
265 # Read a PTE from a TLB PTE memory row
266 def read_tlb_pte(way, ptes):
267 return ptes.word_select(way, TLB_PTE_BITS)
268
269 def write_tlb_pte(way, ptes, newpte):
270 return read_tlb_pte(way, ptes).eq(newpte)
271
272
273 # Record for storing permission, attribute, etc. bits from a PTE
274 class PermAttr(RecordObject):
275 def __init__(self, name=None):
276 super().__init__(name=name)
277 self.reference = Signal()
278 self.changed = Signal()
279 self.nocache = Signal()
280 self.priv = Signal()
281 self.rd_perm = Signal()
282 self.wr_perm = Signal()
283
284
285 def extract_perm_attr(pte):
286 pa = PermAttr()
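    # note: stub - the individual permission/attribute fields are not
    # extracted from the PTE here (this helper is currently unused)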
287     return pa
288
289
290 # Type of operation on a "valid" input
291 @unique
292 class Op(Enum):
293 OP_NONE = 0
294 OP_BAD = 1 # NC cache hit, TLB miss, prot/RC failure
295 OP_STCX_FAIL = 2 # conditional store w/o reservation
296 OP_LOAD_HIT = 3 # Cache hit on load
297 OP_LOAD_MISS = 4 # Load missing cache
298 OP_LOAD_NC = 5 # Non-cachable load
299 OP_STORE_HIT = 6 # Store hitting cache
300 OP_STORE_MISS = 7 # Store missing cache
301
302
303 # Cache state machine
304 @unique
305 class State(Enum):
306 IDLE = 0 # Normal load hit processing
307 RELOAD_WAIT_ACK = 1 # Cache reload wait ack
308 STORE_WAIT_ACK = 2 # Store wait ack
309 NC_LOAD_WAIT_ACK = 3 # Non-cachable load wait ack
310
311
312 # Dcache operations:
313 #
314 # In order to make timing, we use the BRAMs with
315 # an output buffer, which means that the BRAM
316 # output is delayed by an extra cycle.
317 #
318 # Thus, the dcache has a 2-stage internal pipeline
319 # for cache hits with no stalls.
320 #
321 # All other operations are handled via stalling
322 # in the first stage.
323 #
324 # The second stage can thus complete a hit at the same
325 # time as the first stage emits a stall for a complex op.
326 #
327 # Stage 0 register, basically contains just the latched request
328
329 class RegStage0(RecordObject):
330 def __init__(self, name=None):
331 super().__init__(name=name)
332 self.req = LoadStore1ToDCacheType(name="lsmem")
333 self.tlbie = Signal() # indicates a tlbie request (from MMU)
334 self.doall = Signal() # with tlbie, indicates flush whole TLB
335 self.tlbld = Signal() # indicates a TLB load request (from MMU)
336 self.mmu_req = Signal() # indicates source of request
337 self.d_valid = Signal() # indicates req.data is valid now
338
339
340 class MemAccessRequest(RecordObject):
341 def __init__(self, name=None):
342 super().__init__(name=name)
343 self.op = Signal(Op)
344 self.valid = Signal()
345 self.dcbz = Signal()
346 self.real_addr = Signal(REAL_ADDR_BITS)
347 self.data = Signal(64)
348 self.byte_sel = Signal(8)
349 self.hit_way = Signal(WAY_BITS)
350 self.same_tag = Signal()
351 self.mmu_req = Signal()
352
353
354 # First stage register, contains state for stage 1 of load hits
355 # and for the state machine used by all other operations
356 class RegStage1(RecordObject):
357 def __init__(self, name=None):
358 super().__init__(name=name)
359 # Info about the request
360 self.full = Signal() # have uncompleted request
361 self.mmu_req = Signal() # request is from MMU
362 self.req = MemAccessRequest(name="reqmem")
363
364 # Cache hit state
365 self.hit_way = Signal(WAY_BITS)
366 self.hit_load_valid = Signal()
367 self.hit_index = Signal(INDEX_BITS)
368 self.cache_hit = Signal()
369
370 # TLB hit state
371 self.tlb_hit = Signal()
372         self.tlb_hit_way = Signal(TLB_WAY_BITS)
373         self.tlb_hit_index = Signal(TLB_SET_BITS)
374
375 # 2-stage data buffer for data forwarded from writes to reads
376 self.forward_data1 = Signal(64)
377 self.forward_data2 = Signal(64)
378 self.forward_sel1 = Signal(8)
379 self.forward_valid1 = Signal()
380 self.forward_way1 = Signal(WAY_BITS)
381 self.forward_row1 = Signal(ROW_BITS)
382 self.use_forward1 = Signal()
383 self.forward_sel = Signal(8)
384
385 # Cache miss state (reload state machine)
386 self.state = Signal(State)
387 self.dcbz = Signal()
388 self.write_bram = Signal()
389 self.write_tag = Signal()
390 self.slow_valid = Signal()
391 self.wb = WBMasterOut("wb")
392 self.reload_tag = Signal(TAG_BITS)
393 self.store_way = Signal(WAY_BITS)
394 self.store_row = Signal(ROW_BITS)
395 self.store_index = Signal(INDEX_BITS)
396 self.end_row_ix = Signal(ROW_LINE_BITS)
397 self.rows_valid = RowPerLineValidArray()
398 self.acks_pending = Signal(3)
399 self.inc_acks = Signal()
400 self.dec_acks = Signal()
401
402 # Signals to complete (possibly with error)
403 self.ls_valid = Signal()
404 self.ls_error = Signal()
405 self.mmu_done = Signal()
406 self.mmu_error = Signal()
407 self.cache_paradox = Signal()
408
409 # Signal to complete a failed stcx.
410 self.stcx_fail = Signal()
411
412
413 # Reservation information
414 class Reservation(RecordObject):
415 def __init__(self):
416 super().__init__()
417 self.valid = Signal()
418 self.addr = Signal(64-LINE_OFF_BITS)
419
420
421 class DTLBUpdate(Elaboratable):
422 def __init__(self):
423 self.tlbie = Signal()
424 self.tlbwe = Signal()
425 self.doall = Signal()
426 self.updated = Signal()
427 self.v_updated = Signal()
428 self.tlb_hit = Signal()
429 self.tlb_req_index = Signal(TLB_SET_BITS)
430
431 self.tlb_hit_way = Signal(TLB_WAY_BITS)
432 self.tlb_tag_way = Signal(TLB_TAG_WAY_BITS)
433 self.tlb_pte_way = Signal(TLB_PTE_WAY_BITS)
434 self.repl_way = Signal(TLB_WAY_BITS)
435 self.eatag = Signal(TLB_EA_TAG_BITS)
436 self.pte_data = Signal(TLB_PTE_BITS)
437
438 self.dv = Signal(TLB_NUM_WAYS) # tlb_way_valids_t
439
440 self.tb_out = Signal(TLB_TAG_WAY_BITS) # tlb_way_tags_t
441 self.pb_out = Signal(TLB_NUM_WAYS) # tlb_way_valids_t
442 self.db_out = Signal(TLB_PTE_WAY_BITS) # tlb_way_ptes_t
443
444 def elaborate(self, platform):
445 m = Module()
446 comb = m.d.comb
447 sync = m.d.sync
448
449 tagset = Signal(TLB_TAG_WAY_BITS)
450 pteset = Signal(TLB_PTE_WAY_BITS)
451
452 tb_out, pb_out, db_out = self.tb_out, self.pb_out, self.db_out
453 comb += db_out.eq(self.dv)
454
455 with m.If(self.tlbie & self.doall):
456 pass # clear all back in parent
457 with m.Elif(self.tlbie):
458 with m.If(self.tlb_hit):
459                 comb += db_out.bit_select(self.tlb_hit_way, 1).eq(0)
460 comb += self.v_updated.eq(1)
461
462 with m.Elif(self.tlbwe):
463
464 comb += tagset.eq(self.tlb_tag_way)
465 comb += write_tlb_tag(self.repl_way, tagset, self.eatag)
466 comb += tb_out.eq(tagset)
467
468 comb += pteset.eq(self.tlb_pte_way)
469 comb += write_tlb_pte(self.repl_way, pteset, self.pte_data)
470 comb += pb_out.eq(pteset)
471
472 comb += db_out.bit_select(self.repl_way, 1).eq(1)
473
474 comb += self.updated.eq(1)
475 comb += self.v_updated.eq(1)
476
477 return m
478
479
480 class DCachePendingHit(Elaboratable):
481
482 def __init__(self, tlb_pte_way, tlb_valid_way, tlb_hit_way,
483 cache_valid_idx, cache_tag_set,
484 req_addr,
485 hit_set):
486
487 self.go = Signal()
488 self.virt_mode = Signal()
489 self.is_hit = Signal()
490 self.tlb_hit = Signal()
491 self.hit_way = Signal(WAY_BITS)
492 self.rel_match = Signal()
493 self.req_index = Signal(INDEX_BITS)
494 self.reload_tag = Signal(TAG_BITS)
495
496 self.tlb_hit_way = tlb_hit_way
497 self.tlb_pte_way = tlb_pte_way
498 self.tlb_valid_way = tlb_valid_way
499 self.cache_valid_idx = cache_valid_idx
500 self.cache_tag_set = cache_tag_set
501 self.req_addr = req_addr
502 self.hit_set = hit_set
503
504 def elaborate(self, platform):
505 m = Module()
506 comb = m.d.comb
507 sync = m.d.sync
508
509 go = self.go
510 virt_mode = self.virt_mode
511 is_hit = self.is_hit
512 tlb_pte_way = self.tlb_pte_way
513 tlb_valid_way = self.tlb_valid_way
514 cache_valid_idx = self.cache_valid_idx
515 cache_tag_set = self.cache_tag_set
516 req_addr = self.req_addr
517 tlb_hit_way = self.tlb_hit_way
518 tlb_hit = self.tlb_hit
519 hit_set = self.hit_set
520 hit_way = self.hit_way
521 rel_match = self.rel_match
522 req_index = self.req_index
523 reload_tag = self.reload_tag
524
525 rel_matches = Array(Signal(name="rel_matches_%d" % i) \
526 for i in range(TLB_NUM_WAYS))
527 hit_way_set = HitWaySet()
528
529 # Test if pending request is a hit on any way
530 # In order to make timing in virtual mode,
531 # when we are using the TLB, we compare each
532 # way with each of the real addresses from each way of
533 # the TLB, and then decide later which match to use.
534
535 with m.If(virt_mode):
536 for j in range(TLB_NUM_WAYS): # tlb_num_way_t
537 s_tag = Signal(TAG_BITS, name="s_tag%d" % j)
538 s_hit = Signal()
539 s_pte = Signal(TLB_PTE_BITS)
540 s_ra = Signal(REAL_ADDR_BITS)
541 comb += s_pte.eq(read_tlb_pte(j, tlb_pte_way))
542 comb += s_ra.eq(Cat(req_addr[0:TLB_LG_PGSZ],
543 s_pte[TLB_LG_PGSZ:REAL_ADDR_BITS]))
544 comb += s_tag.eq(get_tag(s_ra))
545
546 for i in range(NUM_WAYS): # way_t
547 is_tag_hit = Signal(name="is_tag_hit_%d_%d" % (j, i))
548 comb += is_tag_hit.eq(go & cache_valid_idx[i] &
549 (read_tag(i, cache_tag_set) == s_tag)
550 & tlb_valid_way[j])
551 with m.If(is_tag_hit):
552 comb += hit_way_set[j].eq(i)
553 comb += s_hit.eq(1)
554 comb += hit_set[j].eq(s_hit)
555 with m.If(s_tag == reload_tag):
556 comb += rel_matches[j].eq(1)
557 with m.If(tlb_hit):
558 comb += is_hit.eq(hit_set[tlb_hit_way])
559 comb += hit_way.eq(hit_way_set[tlb_hit_way])
560 comb += rel_match.eq(rel_matches[tlb_hit_way])
561 with m.Else():
562 s_tag = Signal(TAG_BITS)
563 comb += s_tag.eq(get_tag(req_addr))
564 for i in range(NUM_WAYS): # way_t
565 is_tag_hit = Signal(name="is_tag_hit_%d" % i)
566 comb += is_tag_hit.eq(go & cache_valid_idx[i] &
567 (read_tag(i, cache_tag_set) == s_tag))
568 with m.If(is_tag_hit):
569 comb += hit_way.eq(i)
570 comb += is_hit.eq(1)
571 with m.If(s_tag == reload_tag):
572 comb += rel_match.eq(1)
573
574 return m
575
576
577 class DCache(Elaboratable):
578 """Set associative dcache write-through
579 TODO (in no specific order):
580 * See list in icache.vhdl
581 * Complete load misses on the cycle when WB data comes instead of
582 at the end of line (this requires dealing with requests coming in
583 while not idle...)
584 """
585 def __init__(self):
586 self.d_in = LoadStore1ToDCacheType("d_in")
587 self.d_out = DCacheToLoadStore1Type("d_out")
588
589 self.m_in = MMUToDCacheType("m_in")
590 self.m_out = DCacheToMMUType("m_out")
591
592 self.stall_out = Signal()
593
594 self.wb_out = WBMasterOut("wb_out")
595 self.wb_in = WBSlaveOut("wb_in")
596
597 self.log_out = Signal(20)
598
599 def stage_0(self, m, r0, r1, r0_full):
600 """Latch the request in r0.req as long as we're not stalling
601 """
602 comb = m.d.comb
603 sync = m.d.sync
604 d_in, d_out, m_in = self.d_in, self.d_out, self.m_in
605
606 r = RegStage0("stage0")
607
608 # TODO, this goes in unit tests and formal proofs
609 with m.If(d_in.valid & m_in.valid):
610 sync += Display("request collision loadstore vs MMU")
611
612 with m.If(m_in.valid):
613 comb += r.req.valid.eq(1)
614 comb += r.req.load.eq(~(m_in.tlbie | m_in.tlbld))
615 comb += r.req.dcbz.eq(0)
616 comb += r.req.nc.eq(0)
617 comb += r.req.reserve.eq(0)
618 comb += r.req.virt_mode.eq(0)
619 comb += r.req.priv_mode.eq(1)
620 comb += r.req.addr.eq(m_in.addr)
621 comb += r.req.data.eq(m_in.pte)
622 comb += r.req.byte_sel.eq(~0) # Const -1 sets all to 0b111....
623 comb += r.tlbie.eq(m_in.tlbie)
624 comb += r.doall.eq(m_in.doall)
625 comb += r.tlbld.eq(m_in.tlbld)
626 comb += r.mmu_req.eq(1)
627 with m.Else():
628 comb += r.req.eq(d_in)
629 comb += r.req.data.eq(0)
630 comb += r.tlbie.eq(0)
631 comb += r.doall.eq(0)
632 comb += r.tlbld.eq(0)
633 comb += r.mmu_req.eq(0)
634 with m.If((~r1.full & ~d_in.hold) | ~r0_full):
635 sync += r0.eq(r)
636 sync += r0_full.eq(r.req.valid)
637 # Sample data the cycle after a request comes in from loadstore1.
638 # If another request has come in already then the data will get
639 # put directly into req.data below.
640 with m.If(r0.req.valid & ~r.req.valid & ~r0.d_valid &
641 ~r0.mmu_req):
642 sync += r0.req.data.eq(d_in.data)
643 sync += r0.d_valid.eq(1)
644
645 def tlb_read(self, m, r0_stall, tlb_valid_way,
646 tlb_tag_way, tlb_pte_way, dtlb_valid_bits,
647 dtlb_tags, dtlb_ptes):
648 """TLB
649 Operates in the second cycle on the request latched in r0.req.
650 TLB updates write the entry at the end of the second cycle.
651 """
652 comb = m.d.comb
653 sync = m.d.sync
654 m_in, d_in = self.m_in, self.d_in
655
656 index = Signal(TLB_SET_BITS)
657 addrbits = Signal(TLB_SET_BITS)
658
659 amin = TLB_LG_PGSZ
660 amax = TLB_LG_PGSZ + TLB_SET_BITS
661
662 with m.If(m_in.valid):
663 comb += addrbits.eq(m_in.addr[amin : amax])
664 with m.Else():
665 comb += addrbits.eq(d_in.addr[amin : amax])
666 comb += index.eq(addrbits)
667
668 # If we have any op and the previous op isn't finished,
669 # then keep the same output for next cycle.
670 with m.If(~r0_stall):
671 sync += tlb_valid_way.eq(dtlb_valid_bits[index])
672 sync += tlb_tag_way.eq(dtlb_tags[index])
673 sync += tlb_pte_way.eq(dtlb_ptes[index])
674
675 def maybe_tlb_plrus(self, m, r1, tlb_plru_victim):
676 """Generate TLB PLRUs
677 """
678 comb = m.d.comb
679 sync = m.d.sync
680
681 if TLB_NUM_WAYS == 0:
682 return
683 for i in range(TLB_SET_SIZE):
684 # TLB PLRU interface
685 tlb_plru = PLRU(TLB_WAY_BITS)
686 setattr(m.submodules, "maybe_plru_%d" % i, tlb_plru)
687 tlb_plru_acc_en = Signal()
688
689 comb += tlb_plru_acc_en.eq(r1.tlb_hit & (r1.tlb_hit_index == i))
690 comb += tlb_plru.acc_en.eq(tlb_plru_acc_en)
691 comb += tlb_plru.acc_i.eq(r1.tlb_hit_way)
692 comb += tlb_plru_victim[i].eq(tlb_plru.lru_o)
693
694 def tlb_search(self, m, tlb_req_index, r0, r0_valid,
695 tlb_valid_way, tlb_tag_way, tlb_hit_way,
696 tlb_pte_way, pte, tlb_hit, valid_ra, perm_attr, ra):
697
698 comb = m.d.comb
699
700 hitway = Signal(TLB_WAY_BITS)
701 hit = Signal()
702 eatag = Signal(TLB_EA_TAG_BITS)
703
704 TLB_LG_END = TLB_LG_PGSZ + TLB_SET_BITS
705 comb += tlb_req_index.eq(r0.req.addr[TLB_LG_PGSZ : TLB_LG_END])
706 comb += eatag.eq(r0.req.addr[TLB_LG_END : 64 ])
707
708 for i in range(TLB_NUM_WAYS):
709 is_tag_hit = Signal()
710 comb += is_tag_hit.eq(tlb_valid_way[i]
711 & (read_tlb_tag(i, tlb_tag_way) == eatag))
712 with m.If(is_tag_hit):
713 comb += hitway.eq(i)
714 comb += hit.eq(1)
715
716 comb += tlb_hit.eq(hit & r0_valid)
717 comb += tlb_hit_way.eq(hitway)
718
719 with m.If(tlb_hit):
720 comb += pte.eq(read_tlb_pte(hitway, tlb_pte_way))
721 comb += valid_ra.eq(tlb_hit | ~r0.req.virt_mode)
722
723 with m.If(r0.req.virt_mode):
724 comb += ra.eq(Cat(Const(0, ROW_OFF_BITS),
725 r0.req.addr[ROW_OFF_BITS:TLB_LG_PGSZ],
726 pte[TLB_LG_PGSZ:REAL_ADDR_BITS]))
727 comb += perm_attr.reference.eq(pte[8])
728 comb += perm_attr.changed.eq(pte[7])
729 comb += perm_attr.nocache.eq(pte[5])
730 comb += perm_attr.priv.eq(pte[3])
731 comb += perm_attr.rd_perm.eq(pte[2])
732 comb += perm_attr.wr_perm.eq(pte[1])
733 with m.Else():
734 comb += ra.eq(Cat(Const(0, ROW_OFF_BITS),
735 r0.req.addr[ROW_OFF_BITS:REAL_ADDR_BITS]))
736 comb += perm_attr.reference.eq(1)
737 comb += perm_attr.changed.eq(1)
738 comb += perm_attr.nocache.eq(0)
739 comb += perm_attr.priv.eq(1)
740 comb += perm_attr.rd_perm.eq(1)
741 comb += perm_attr.wr_perm.eq(1)
742
743 def tlb_update(self, m, r0_valid, r0, dtlb_valid_bits, tlb_req_index,
744 tlb_hit_way, tlb_hit, tlb_plru_victim, tlb_tag_way,
745 dtlb_tags, tlb_pte_way, dtlb_ptes):
746
747 dtlb_valids = TLBValidBitsArray()
748
749 comb = m.d.comb
750 sync = m.d.sync
751
752 tlbie = Signal()
753 tlbwe = Signal()
754
755 comb += tlbie.eq(r0_valid & r0.tlbie)
756 comb += tlbwe.eq(r0_valid & r0.tlbld)
757
758 m.submodules.tlb_update = d = DTLBUpdate()
759 with m.If(tlbie & r0.doall):
760 # clear all valid bits at once
761 for i in range(TLB_SET_SIZE):
762 sync += dtlb_valid_bits[i].eq(0)
763 with m.If(d.updated):
764 sync += dtlb_tags[tlb_req_index].eq(d.tb_out)
765 sync += dtlb_ptes[tlb_req_index].eq(d.pb_out)
766 with m.If(d.v_updated):
767 sync += dtlb_valid_bits[tlb_req_index].eq(d.db_out)
768
769 comb += d.dv.eq(dtlb_valid_bits[tlb_req_index])
770
771 comb += d.tlbie.eq(tlbie)
772 comb += d.tlbwe.eq(tlbwe)
773 comb += d.doall.eq(r0.doall)
774 comb += d.tlb_hit.eq(tlb_hit)
775 comb += d.tlb_hit_way.eq(tlb_hit_way)
776 comb += d.tlb_tag_way.eq(tlb_tag_way)
777 comb += d.tlb_pte_way.eq(tlb_pte_way)
778 comb += d.tlb_req_index.eq(tlb_req_index)
779
780 with m.If(tlb_hit):
781 comb += d.repl_way.eq(tlb_hit_way)
782 with m.Else():
783 comb += d.repl_way.eq(tlb_plru_victim[tlb_req_index])
784 comb += d.eatag.eq(r0.req.addr[TLB_LG_PGSZ + TLB_SET_BITS:64])
785 comb += d.pte_data.eq(r0.req.data)
786
787 def maybe_plrus(self, m, r1, plru_victim):
788 """Generate PLRUs
789 """
790 comb = m.d.comb
791 sync = m.d.sync
792
793         if NUM_WAYS == 0:
794 return
795
796 for i in range(NUM_LINES):
797 # PLRU interface
798 plru = PLRU(WAY_BITS)
799 setattr(m.submodules, "plru%d" % i, plru)
800 plru_acc_en = Signal()
801
802 comb += plru_acc_en.eq(r1.cache_hit & (r1.hit_index == i))
803 comb += plru.acc_en.eq(plru_acc_en)
804 comb += plru.acc_i.eq(r1.hit_way)
805 comb += plru_victim[i].eq(plru.lru_o)
806
807 def cache_tag_read(self, m, r0_stall, req_index, cache_tag_set, cache_tags):
808 """Cache tag RAM read port
809 """
810 comb = m.d.comb
811 sync = m.d.sync
812 m_in, d_in = self.m_in, self.d_in
813
814 index = Signal(INDEX_BITS)
815
816 with m.If(r0_stall):
817 comb += index.eq(req_index)
818 with m.Elif(m_in.valid):
819 comb += index.eq(get_index(m_in.addr))
820 with m.Else():
821 comb += index.eq(get_index(d_in.addr))
822 sync += cache_tag_set.eq(cache_tags[index])
823
824 def dcache_request(self, m, r0, ra, req_index, req_row, req_tag,
825 r0_valid, r1, cache_valids, replace_way,
826 use_forward1_next, use_forward2_next,
827 req_hit_way, plru_victim, rc_ok, perm_attr,
828 valid_ra, perm_ok, access_ok, req_op, req_go,
829 tlb_pte_way,
830 tlb_hit, tlb_hit_way, tlb_valid_way, cache_tag_set,
831 cancel_store, req_same_tag, r0_stall, early_req_row):
832 """Cache request parsing and hit detection
833 """
834
835 comb = m.d.comb
836 m_in, d_in = self.m_in, self.d_in
837
838 is_hit = Signal()
839 hit_way = Signal(WAY_BITS)
840 op = Signal(Op)
841 opsel = Signal(3)
842 go = Signal()
843 nc = Signal()
844 hit_set = Array(Signal(name="hit_set_%d" % i) \
845 for i in range(TLB_NUM_WAYS))
846 cache_valid_idx = Signal(NUM_WAYS)
847
848 # Extract line, row and tag from request
849 comb += req_index.eq(get_index(r0.req.addr))
850 comb += req_row.eq(get_row(r0.req.addr))
851 comb += req_tag.eq(get_tag(ra))
852
853 if False: # display on comb is a bit... busy.
854 comb += Display("dcache_req addr:%x ra: %x idx: %x tag: %x row: %x",
855 r0.req.addr, ra, req_index, req_tag, req_row)
856
857 comb += go.eq(r0_valid & ~(r0.tlbie | r0.tlbld) & ~r1.ls_error)
858 comb += cache_valid_idx.eq(cache_valids[req_index])
859
860 m.submodules.dcache_pend = dc = DCachePendingHit(tlb_pte_way,
861 tlb_valid_way, tlb_hit_way,
862 cache_valid_idx, cache_tag_set,
863 r0.req.addr,
864 hit_set)
865
866 comb += dc.tlb_hit.eq(tlb_hit)
867 comb += dc.reload_tag.eq(r1.reload_tag)
868 comb += dc.virt_mode.eq(r0.req.virt_mode)
869 comb += dc.go.eq(go)
870 comb += dc.req_index.eq(req_index)
871 comb += is_hit.eq(dc.is_hit)
872 comb += hit_way.eq(dc.hit_way)
873 comb += req_same_tag.eq(dc.rel_match)
874
875 # See if the request matches the line currently being reloaded
876 with m.If((r1.state == State.RELOAD_WAIT_ACK) &
877 (req_index == r1.store_index) & req_same_tag):
878 # For a store, consider this a hit even if the row isn't
879 # valid since it will be by the time we perform the store.
880 # For a load, check the appropriate row valid bit.
881 rrow = Signal(ROW_LINE_BITS)
882 comb += rrow.eq(req_row)
883 valid = r1.rows_valid[rrow]
884 comb += is_hit.eq((~r0.req.load) | valid)
885 comb += hit_way.eq(replace_way)
886
887 # Whether to use forwarded data for a load or not
888 with m.If((get_row(r1.req.real_addr) == req_row) &
889 (r1.req.hit_way == hit_way)):
890 # Only need to consider r1.write_bram here, since if we
891 # are writing refill data here, then we don't have a
892 # cache hit this cycle on the line being refilled.
893 # (There is the possibility that the load following the
894 # load miss that started the refill could be to the old
895 # contents of the victim line, since it is a couple of
896 # cycles after the refill starts before we see the updated
897 # cache tag. In that case we don't use the bypass.)
898 comb += use_forward1_next.eq(r1.write_bram)
899 with m.If((r1.forward_row1 == req_row) & (r1.forward_way1 == hit_way)):
900 comb += use_forward2_next.eq(r1.forward_valid1)
901
902 # The way that matched on a hit
903 comb += req_hit_way.eq(hit_way)
904
905 # The way to replace on a miss
906 with m.If(r1.write_tag):
907 comb += replace_way.eq(plru_victim[r1.store_index])
908 with m.Else():
909 comb += replace_way.eq(r1.store_way)
910
911 # work out whether we have permission for this access
912 # NB we don't yet implement AMR, thus no KUAP
913 comb += rc_ok.eq(perm_attr.reference
914 & (r0.req.load | perm_attr.changed))
915 comb += perm_ok.eq((r0.req.priv_mode | (~perm_attr.priv)) &
916 (perm_attr.wr_perm |
917 (r0.req.load & perm_attr.rd_perm)))
918 comb += access_ok.eq(valid_ra & perm_ok & rc_ok)
919 # Combine the request and cache hit status to decide what
920 # operation needs to be done
921 comb += nc.eq(r0.req.nc | perm_attr.nocache)
922 comb += op.eq(Op.OP_NONE)
923 with m.If(go):
924 with m.If(~access_ok):
925 comb += op.eq(Op.OP_BAD)
926 with m.Elif(cancel_store):
927 comb += op.eq(Op.OP_STCX_FAIL)
928 with m.Else():
929 comb += opsel.eq(Cat(is_hit, nc, r0.req.load))
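                # opsel bit order (Cat packs LSB first):
                #   bit 0 = is_hit, bit 1 = nc, bit 2 = r0.req.load
                # e.g. 0b101 = cacheable load that hit => OP_LOAD_HIT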
930 with m.Switch(opsel):
931 with m.Case(0b101): comb += op.eq(Op.OP_LOAD_HIT)
932 with m.Case(0b100): comb += op.eq(Op.OP_LOAD_MISS)
933 with m.Case(0b110): comb += op.eq(Op.OP_LOAD_NC)
934 with m.Case(0b001): comb += op.eq(Op.OP_STORE_HIT)
935 with m.Case(0b000): comb += op.eq(Op.OP_STORE_MISS)
936 with m.Case(0b010): comb += op.eq(Op.OP_STORE_MISS)
937 with m.Case(0b011): comb += op.eq(Op.OP_BAD)
938 with m.Case(0b111): comb += op.eq(Op.OP_BAD)
939 comb += req_op.eq(op)
940 comb += req_go.eq(go)
941
942 # Version of the row number that is valid one cycle earlier
943 # in the cases where we need to read the cache data BRAM.
944 # If we're stalling then we need to keep reading the last
945 # row requested.
946 with m.If(~r0_stall):
947 with m.If(m_in.valid):
948 comb += early_req_row.eq(get_row(m_in.addr))
949 with m.Else():
950 comb += early_req_row.eq(get_row(d_in.addr))
951 with m.Else():
952 comb += early_req_row.eq(req_row)
953
954 def reservation_comb(self, m, cancel_store, set_rsrv, clear_rsrv,
955 r0_valid, r0, reservation):
956 """Handle load-with-reservation and store-conditional instructions
957 """
958 comb = m.d.comb
959
960 with m.If(r0_valid & r0.req.reserve):
961 # XXX generate alignment interrupt if address
962 # is not aligned XXX or if r0.req.nc = '1'
963 with m.If(r0.req.load):
964 comb += set_rsrv.eq(r0.req.atomic_last) # load with reservation
965 with m.Else():
966 comb += clear_rsrv.eq(r0.req.atomic_last) # store conditional
967 with m.If((~reservation.valid) |
968 (r0.req.addr[LINE_OFF_BITS:64] != reservation.addr)):
969 comb += cancel_store.eq(1)
970
971 def reservation_reg(self, m, r0_valid, access_ok, set_rsrv, clear_rsrv,
972 reservation, r0):
973
974 comb = m.d.comb
975 sync = m.d.sync
976
977 with m.If(r0_valid & access_ok):
978 with m.If(clear_rsrv):
979 sync += reservation.valid.eq(0)
980 with m.Elif(set_rsrv):
981 sync += reservation.valid.eq(1)
982 sync += reservation.addr.eq(r0.req.addr[LINE_OFF_BITS:64])
983
984 def writeback_control(self, m, r1, cache_out_row):
985 """Return data for loads & completion control logic
986 """
987 comb = m.d.comb
988 sync = m.d.sync
989 d_out, m_out = self.d_out, self.m_out
990
991 data_out = Signal(64)
992 data_fwd = Signal(64)
993
994 # Use the bypass if are reading the row that was
995 # written 1 or 2 cycles ago, including for the
996 # slow_valid = 1 case (i.e. completing a load
997 # miss or a non-cacheable load).
998 with m.If(r1.use_forward1):
999 comb += data_fwd.eq(r1.forward_data1)
1000 with m.Else():
1001 comb += data_fwd.eq(r1.forward_data2)
1002
1003 comb += data_out.eq(cache_out_row)
1004
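        # overlay any forwarded bytes (selected by forward_sel) on top
        # of the BRAM output, one byte lane at a time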
1005 for i in range(8):
1006 with m.If(r1.forward_sel[i]):
1007 dsel = data_fwd.word_select(i, 8)
1008 comb += data_out.word_select(i, 8).eq(dsel)
1009
1010 comb += d_out.valid.eq(r1.ls_valid)
1011 comb += d_out.data.eq(data_out)
1012 comb += d_out.store_done.eq(~r1.stcx_fail)
1013 comb += d_out.error.eq(r1.ls_error)
1014 comb += d_out.cache_paradox.eq(r1.cache_paradox)
1015
1016 # Outputs to MMU
1017 comb += m_out.done.eq(r1.mmu_done)
1018 comb += m_out.err.eq(r1.mmu_error)
1019 comb += m_out.data.eq(data_out)
1020
1021 # We have a valid load or store hit or we just completed
1022 # a slow op such as a load miss, a NC load or a store
1023 #
1024 # Note: the load hit is delayed by one cycle. However it
1025 # can still not collide with r.slow_valid (well unless I
1026 # miscalculated) because slow_valid can only be set on a
1027 # subsequent request and not on its first cycle (the state
1028 # machine must have advanced), which makes slow_valid
1029 # at least 2 cycles from the previous hit_load_valid.
1030
1031 # Sanity: Only one of these must be set in any given cycle
1032
1033 if False: # TODO: need Display to get this to work
1034 assert (r1.slow_valid & r1.stcx_fail) != 1, \
1035 "unexpected slow_valid collision with stcx_fail"
1036
1037 assert ((r1.slow_valid | r1.stcx_fail) | r1.hit_load_valid) != 1, \
1038 "unexpected hit_load_delayed collision with slow_valid"
1039
1040 with m.If(~r1.mmu_req):
1041 # Request came from loadstore1...
1042 # Load hit case is the standard path
1043 with m.If(r1.hit_load_valid):
1044 sync += Display("completing load hit data=%x", data_out)
1045
1046 # error cases complete without stalling
1047 with m.If(r1.ls_error):
1048 sync += Display("completing ld/st with error")
1049
1050 # Slow ops (load miss, NC, stores)
1051 with m.If(r1.slow_valid):
1052 sync += Display("completing store or load miss adr=%x data=%x",
1053 r1.req.real_addr, data_out)
1054
1055 with m.Else():
1056 # Request came from MMU
1057 with m.If(r1.hit_load_valid):
1058 sync += Display("completing load hit to MMU, data=%x",
1059 m_out.data)
1060 # error cases complete without stalling
1061 with m.If(r1.mmu_error):
1062                 sync += Display("completing MMU ld with error")
1063
1064 # Slow ops (i.e. load miss)
1065 with m.If(r1.slow_valid):
1066 sync += Display("completing MMU load miss, data=%x",
1067 m_out.data)
1068
1069 def rams(self, m, r1, early_req_row, cache_out_row, replace_way):
1070 """rams
1071 Generate a cache RAM for each way. This handles the normal
1072 reads, writes from reloads and the special store-hit update
1073 path as well.
1074
1075 Note: the BRAMs have an extra read buffer, meaning the output
1076 is pipelined an extra cycle. This differs from the
1077 icache. The writeback logic needs to take that into
1078 account by using 1-cycle delayed signals for load hits.
1079 """
1080 comb = m.d.comb
1081 wb_in = self.wb_in
1082
1083 for i in range(NUM_WAYS):
1084 do_read = Signal(name="do_rd%d" % i)
1085 rd_addr = Signal(ROW_BITS, name="rd_addr_%d" % i)
1086 do_write = Signal(name="do_wr%d" % i)
1087 wr_addr = Signal(ROW_BITS, name="wr_addr_%d" % i)
1088 wr_data = Signal(WB_DATA_BITS, name="din_%d" % i)
1089 wr_sel = Signal(ROW_SIZE)
1090 wr_sel_m = Signal(ROW_SIZE)
1091 _d_out = Signal(WB_DATA_BITS, name="dout_%d" % i) # cache_row_t
1092
1093 way = CacheRam(ROW_BITS, WB_DATA_BITS, ADD_BUF=True)
1094 setattr(m.submodules, "cacheram_%d" % i, way)
1095
1096 comb += way.rd_en.eq(do_read)
1097 comb += way.rd_addr.eq(rd_addr)
1098 comb += _d_out.eq(way.rd_data_o)
1099 comb += way.wr_sel.eq(wr_sel_m)
1100 comb += way.wr_addr.eq(wr_addr)
1101 comb += way.wr_data.eq(wr_data)
1102
1103 # Cache hit reads
1104 comb += do_read.eq(1)
1105 comb += rd_addr.eq(early_req_row)
1106 with m.If(r1.hit_way == i):
1107 comb += cache_out_row.eq(_d_out)
1108
1109 # Write mux:
1110 #
1111 # Defaults to wishbone read responses (cache refill)
1112 #
1113 # For timing, the mux on wr_data/sel/addr is not
1114 # dependent on anything other than the current state.
1115
1116 with m.If(r1.write_bram):
1117 # Write store data to BRAM. This happens one
1118 # cycle after the store is in r0.
1119 comb += wr_data.eq(r1.req.data)
1120 comb += wr_sel.eq(r1.req.byte_sel)
1121 comb += wr_addr.eq(get_row(r1.req.real_addr))
1122
1123 with m.If(i == r1.req.hit_way):
1124 comb += do_write.eq(1)
1125 with m.Else():
1126 # Otherwise, we might be doing a reload or a DCBZ
1127 with m.If(r1.dcbz):
1128 comb += wr_data.eq(0)
1129 with m.Else():
1130 comb += wr_data.eq(wb_in.dat)
1131 comb += wr_addr.eq(r1.store_row)
1132 comb += wr_sel.eq(~0) # all 1s
1133
1134 with m.If((r1.state == State.RELOAD_WAIT_ACK)
1135 & wb_in.ack & (replace_way == i)):
1136 comb += do_write.eq(1)
1137
1138 # Mask write selects with do_write since BRAM
1139 # doesn't have a global write-enable
1140 with m.If(do_write):
1141 comb += wr_sel_m.eq(wr_sel)
1142
1143 # Cache hit synchronous machine for the easy case.
1144 # This handles load hits.
1145 # It also handles error cases (TLB miss, cache paradox)
1146 def dcache_fast_hit(self, m, req_op, r0_valid, r0, r1,
1147 req_hit_way, req_index, req_tag, access_ok,
1148 tlb_hit, tlb_hit_way, tlb_req_index):
1149
1150 comb = m.d.comb
1151 sync = m.d.sync
1152
1153 with m.If(req_op != Op.OP_NONE):
1154 sync += Display("op:%d addr:%x nc: %d idx: %x tag: %x way: %x",
1155 req_op, r0.req.addr, r0.req.nc,
1156 req_index, req_tag, req_hit_way)
1157
1158 with m.If(r0_valid):
1159 sync += r1.mmu_req.eq(r0.mmu_req)
1160
1161 # Fast path for load/store hits.
1162 # Set signals for the writeback controls.
1163 sync += r1.hit_way.eq(req_hit_way)
1164 sync += r1.hit_index.eq(req_index)
1165
1166 with m.If(req_op == Op.OP_LOAD_HIT):
1167 sync += r1.hit_load_valid.eq(1)
1168 with m.Else():
1169 sync += r1.hit_load_valid.eq(0)
1170
1171 with m.If((req_op == Op.OP_LOAD_HIT) | (req_op == Op.OP_STORE_HIT)):
1172 sync += r1.cache_hit.eq(1)
1173 with m.Else():
1174 sync += r1.cache_hit.eq(0)
1175
1176 with m.If(req_op == Op.OP_BAD):
1177 # Display(f"Signalling ld/st error valid_ra={valid_ra}"
1178 # f"rc_ok={rc_ok} perm_ok={perm_ok}"
1179 sync += r1.ls_error.eq(~r0.mmu_req)
1180 sync += r1.mmu_error.eq(r0.mmu_req)
1181 sync += r1.cache_paradox.eq(access_ok)
1182
1183 with m.Else():
1184 sync += r1.ls_error.eq(0)
1185 sync += r1.mmu_error.eq(0)
1186 sync += r1.cache_paradox.eq(0)
1187
1188 with m.If(req_op == Op.OP_STCX_FAIL):
1189 sync += r1.stcx_fail.eq(1)
1190 with m.Else():
1191 sync += r1.stcx_fail.eq(0)
1192
1193 # Record TLB hit information for updating TLB PLRU
1194 sync += r1.tlb_hit.eq(tlb_hit)
1195 sync += r1.tlb_hit_way.eq(tlb_hit_way)
1196 sync += r1.tlb_hit_index.eq(tlb_req_index)
1197
1198 # Memory accesses are handled by this state machine:
1199 #
1200 # * Cache load miss/reload (in conjunction with "rams")
1201 # * Load hits for non-cachable forms
1202 # * Stores (the collision case is handled in "rams")
1203 #
1204 # All wishbone requests generation is done here.
1205 # This machine operates at stage 1.
1206 def dcache_slow(self, m, r1, use_forward1_next, use_forward2_next,
1207 cache_valids, r0, replace_way,
1208 req_hit_way, req_same_tag,
1209 r0_valid, req_op, cache_tags, req_go, ra):
1210
1211 comb = m.d.comb
1212 sync = m.d.sync
1213 wb_in = self.wb_in
1214 d_in = self.d_in
1215
1216 req = MemAccessRequest("mreq_ds")
1217
1218 req_row = Signal(ROW_BITS)
1219 req_idx = Signal(INDEX_BITS)
1220 req_tag = Signal(TAG_BITS)
1221 comb += req_idx.eq(get_index(req.real_addr))
1222 comb += req_row.eq(get_row(req.real_addr))
1223 comb += req_tag.eq(get_tag(req.real_addr))
1224
1225 sync += r1.use_forward1.eq(use_forward1_next)
1226 sync += r1.forward_sel.eq(0)
1227
1228 with m.If(use_forward1_next):
1229 sync += r1.forward_sel.eq(r1.req.byte_sel)
1230 with m.Elif(use_forward2_next):
1231 sync += r1.forward_sel.eq(r1.forward_sel1)
1232
1233 sync += r1.forward_data2.eq(r1.forward_data1)
1234 with m.If(r1.write_bram):
1235 sync += r1.forward_data1.eq(r1.req.data)
1236 sync += r1.forward_sel1.eq(r1.req.byte_sel)
1237 sync += r1.forward_way1.eq(r1.req.hit_way)
1238 sync += r1.forward_row1.eq(get_row(r1.req.real_addr))
1239 sync += r1.forward_valid1.eq(1)
1240 with m.Else():
1241 with m.If(r1.dcbz):
1242 sync += r1.forward_data1.eq(0)
1243 with m.Else():
1244 sync += r1.forward_data1.eq(wb_in.dat)
1245 sync += r1.forward_sel1.eq(~0) # all 1s
1246 sync += r1.forward_way1.eq(replace_way)
1247 sync += r1.forward_row1.eq(r1.store_row)
1248 sync += r1.forward_valid1.eq(0)
1249
1250 # One cycle pulses reset
1251 sync += r1.slow_valid.eq(0)
1252 sync += r1.write_bram.eq(0)
1253 sync += r1.inc_acks.eq(0)
1254 sync += r1.dec_acks.eq(0)
1255
1256 sync += r1.ls_valid.eq(0)
1257 # complete tlbies and TLB loads in the third cycle
1258 sync += r1.mmu_done.eq(r0_valid & (r0.tlbie | r0.tlbld))
1259
1260 with m.If((req_op == Op.OP_LOAD_HIT) | (req_op == Op.OP_STCX_FAIL)):
1261 with m.If(~r0.mmu_req):
1262 sync += r1.ls_valid.eq(1)
1263 with m.Else():
1264 sync += r1.mmu_done.eq(1)
1265
1266 with m.If(r1.write_tag):
1267 # Store new tag in selected way
1268 for i in range(NUM_WAYS):
1269 with m.If(i == replace_way):
1270 ct = Signal(TAG_RAM_WIDTH)
1271 comb += ct.eq(cache_tags[r1.store_index])
1272 """
1273 TODO: check this
1274 cache_tags(r1.store_index)((i + 1) * TAG_WIDTH - 1 downto i * TAG_WIDTH) <=
1275 (TAG_WIDTH - 1 downto TAG_BITS => '0') & r1.reload_tag;
1276 """
1277 comb += ct.word_select(i, TAG_WIDTH).eq(r1.reload_tag)
1278 sync += cache_tags[r1.store_index].eq(ct)
1279 sync += r1.store_way.eq(replace_way)
1280 sync += r1.write_tag.eq(0)
1281
1282 # Take request from r1.req if there is one there,
1283 # else from req_op, ra, etc.
1284 with m.If(r1.full):
1285 comb += req.eq(r1.req)
1286 with m.Else():
1287 comb += req.op.eq(req_op)
1288 comb += req.valid.eq(req_go)
1289 comb += req.mmu_req.eq(r0.mmu_req)
1290 comb += req.dcbz.eq(r0.req.dcbz)
1291 comb += req.real_addr.eq(ra)
1292
1293 with m.If(r0.req.dcbz):
1294 # force data to 0 for dcbz
1295 comb += req.data.eq(0)
1296 with m.Elif(r0.d_valid):
1297 comb += req.data.eq(r0.req.data)
1298 with m.Else():
1299 comb += req.data.eq(d_in.data)
1300
1301 # Select all bytes for dcbz
1302 # and for cacheable loads
1303 with m.If(r0.req.dcbz | (r0.req.load & ~r0.req.nc)):
1304 comb += req.byte_sel.eq(~0) # all 1s
1305 with m.Else():
1306 comb += req.byte_sel.eq(r0.req.byte_sel)
1307 comb += req.hit_way.eq(req_hit_way)
1308 comb += req.same_tag.eq(req_same_tag)
1309
1310 # Store the incoming request from r0,
1311 # if it is a slow request
1312 # Note that r1.full = 1 implies req_op = OP_NONE
1313 with m.If((req_op == Op.OP_LOAD_MISS)
1314 | (req_op == Op.OP_LOAD_NC)
1315 | (req_op == Op.OP_STORE_MISS)
1316 | (req_op == Op.OP_STORE_HIT)):
1317 sync += r1.req.eq(req)
1318 sync += r1.full.eq(1)
1319
1320 # Main state machine
1321 with m.Switch(r1.state):
1322
1323 with m.Case(State.IDLE):
1324 sync += r1.wb.adr.eq(req.real_addr[ROW_LINE_BITS:])
1325 sync += r1.wb.sel.eq(req.byte_sel)
1326 sync += r1.wb.dat.eq(req.data)
1327 sync += r1.dcbz.eq(req.dcbz)
1328
1329 # Keep track of our index and way
1330 # for subsequent stores.
1331 sync += r1.store_index.eq(req_idx)
1332 sync += r1.store_row.eq(req_row)
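                # end_row_ix is the row *before* the requested one, so a
                # reload starts at the requested row, wraps around the
                # line and finishes on end_row_ix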
1333 sync += r1.end_row_ix.eq(get_row_of_line(req_row)-1)
1334 sync += r1.reload_tag.eq(req_tag)
1335 sync += r1.req.same_tag.eq(1)
1336
1337 with m.If(req.op == Op.OP_STORE_HIT):
1338 sync += r1.store_way.eq(req.hit_way)
1339
1340 # Reset per-row valid bits,
1341 # ready for handling OP_LOAD_MISS
1342 for i in range(ROW_PER_LINE):
1343 sync += r1.rows_valid[i].eq(0)
1344
1345 with m.If(req_op != Op.OP_NONE):
1346 sync += Display("cache op %d", req.op)
1347
1348 with m.Switch(req.op):
1349 with m.Case(Op.OP_LOAD_HIT):
1350 # stay in IDLE state
1351 pass
1352
1353 with m.Case(Op.OP_LOAD_MISS):
1354 sync += Display("cache miss real addr: %x " \
1355 "idx: %x tag: %x",
1356 req.real_addr, req_row, req_tag)
1357
1358 # Start the wishbone cycle
1359 sync += r1.wb.we.eq(0)
1360 sync += r1.wb.cyc.eq(1)
1361 sync += r1.wb.stb.eq(1)
1362
1363 # Track that we had one request sent
1364 sync += r1.state.eq(State.RELOAD_WAIT_ACK)
1365 sync += r1.write_tag.eq(1)
1366
1367 with m.Case(Op.OP_LOAD_NC):
1368 sync += r1.wb.cyc.eq(1)
1369 sync += r1.wb.stb.eq(1)
1370 sync += r1.wb.we.eq(0)
1371 sync += r1.state.eq(State.NC_LOAD_WAIT_ACK)
1372
1373 with m.Case(Op.OP_STORE_HIT, Op.OP_STORE_MISS):
1374 with m.If(~req.dcbz):
1375 sync += r1.state.eq(State.STORE_WAIT_ACK)
1376 sync += r1.acks_pending.eq(1)
1377 sync += r1.full.eq(0)
1378 sync += r1.slow_valid.eq(1)
1379
1380 with m.If(~req.mmu_req):
1381 sync += r1.ls_valid.eq(1)
1382 with m.Else():
1383 sync += r1.mmu_done.eq(1)
1384
1385 with m.If(req.op == Op.OP_STORE_HIT):
1386 sync += r1.write_bram.eq(1)
1387 with m.Else():
1388 # dcbz is handled much like a load miss except
1389 # that we are writing to memory instead of reading
1390 sync += r1.state.eq(State.RELOAD_WAIT_ACK)
1391
1392 with m.If(req.op == Op.OP_STORE_MISS):
1393 sync += r1.write_tag.eq(1)
1394
1395 sync += r1.wb.we.eq(1)
1396 sync += r1.wb.cyc.eq(1)
1397 sync += r1.wb.stb.eq(1)
1398
1399 # OP_NONE and OP_BAD do nothing
1400 # OP_BAD & OP_STCX_FAIL were
1401 # handled above already
1402 with m.Case(Op.OP_NONE):
1403 pass
1404 with m.Case(Op.OP_BAD):
1405 pass
1406 with m.Case(Op.OP_STCX_FAIL):
1407 pass
1408
1409 with m.Case(State.RELOAD_WAIT_ACK):
1410 ld_stbs_done = Signal()
1411 # Requests are all sent if stb is 0
1412 comb += ld_stbs_done.eq(~r1.wb.stb)
1413
1414 # If we are still sending requests, was one accepted?
1415 with m.If((~wb_in.stall) & r1.wb.stb):
1416 # That was the last word? We are done sending.
1417 # Clear stb and set ld_stbs_done so we can handle an
1418 # eventual last ack on the same cycle.
1419 # sigh - reconstruct wb adr with 3 extra 0s at front
1420 wb_adr = Cat(Const(0, ROW_OFF_BITS), r1.wb.adr)
1421 with m.If(is_last_row_addr(wb_adr, r1.end_row_ix)):
1422 sync += r1.wb.stb.eq(0)
1423 comb += ld_stbs_done.eq(1)
1424
1425 # Calculate the next row address in the current cache line
1426 row = Signal(LINE_OFF_BITS-ROW_OFF_BITS)
1427 comb += row.eq(r1.wb.adr)
1428 sync += r1.wb.adr[:LINE_OFF_BITS-ROW_OFF_BITS].eq(row+1)
1429
1430 # Incoming acks processing
1431 sync += r1.forward_valid1.eq(wb_in.ack)
1432 with m.If(wb_in.ack):
1433 srow = Signal(ROW_LINE_BITS)
1434 comb += srow.eq(r1.store_row)
1435 sync += r1.rows_valid[srow].eq(1)
1436
1437 # If this is the data we were looking for,
1438 # we can complete the request next cycle.
1439 # Compare the whole address in case the
1440 # request in r1.req is not the one that
1441 # started this refill.
1442 with m.If(req.valid & r1.req.same_tag &
1443 ((r1.dcbz & r1.req.dcbz) |
1444 (~r1.dcbz & (r1.req.op == Op.OP_LOAD_MISS))) &
1445 (r1.store_row == get_row(req.real_addr))):
1446 sync += r1.full.eq(0)
1447 sync += r1.slow_valid.eq(1)
1448 with m.If(~r1.mmu_req):
1449 sync += r1.ls_valid.eq(1)
1450 with m.Else():
1451 sync += r1.mmu_done.eq(1)
1452 sync += r1.forward_sel.eq(~0) # all 1s
1453 sync += r1.use_forward1.eq(1)
1454
1455 # Check for completion
1456 with m.If(ld_stbs_done & is_last_row(r1.store_row,
1457 r1.end_row_ix)):
1458 # Complete wishbone cycle
1459 sync += r1.wb.cyc.eq(0)
1460
1461 # Cache line is now valid
1462 cv = Signal(INDEX_BITS)
1463 comb += cv.eq(cache_valids[r1.store_index])
1464 comb += cv.bit_select(r1.store_way, 1).eq(1)
1465 sync += cache_valids[r1.store_index].eq(cv)
1466
1467 sync += r1.state.eq(State.IDLE)
1468
1469 # Increment store row counter
1470 sync += r1.store_row.eq(next_row(r1.store_row))
1471
1472 with m.Case(State.STORE_WAIT_ACK):
1473 st_stbs_done = Signal()
1474 acks = Signal(3)
1475 adjust_acks = Signal(3)
1476
1477 comb += st_stbs_done.eq(~r1.wb.stb)
1478 comb += acks.eq(r1.acks_pending)
1479
1480 with m.If(r1.inc_acks != r1.dec_acks):
1481 with m.If(r1.inc_acks):
1482 comb += adjust_acks.eq(acks + 1)
1483 with m.Else():
1484 comb += adjust_acks.eq(acks - 1)
1485 with m.Else():
1486 comb += adjust_acks.eq(acks)
1487
1488 sync += r1.acks_pending.eq(adjust_acks)
1489
1490 # Clear stb when slave accepted request
1491 with m.If(~wb_in.stall):
1492 # See if there is another store waiting
1493 # to be done which is in the same real page.
1494 with m.If(req.valid):
1495 _ra = req.real_addr[ROW_LINE_BITS:SET_SIZE_BITS]
1496 sync += r1.wb.adr[0:SET_SIZE_BITS].eq(_ra)
1497 sync += r1.wb.dat.eq(req.data)
1498 sync += r1.wb.sel.eq(req.byte_sel)
1499
1500 with m.If((adjust_acks < 7) & req.same_tag &
1501 ((req.op == Op.OP_STORE_MISS)
1502 | (req.op == Op.OP_STORE_HIT))):
1503 sync += r1.wb.stb.eq(1)
1504 comb += st_stbs_done.eq(0)
1505
1506 with m.If(req.op == Op.OP_STORE_HIT):
1507 sync += r1.write_bram.eq(1)
1508 sync += r1.full.eq(0)
1509 sync += r1.slow_valid.eq(1)
1510
1511 # Store requests never come from the MMU
1512 sync += r1.ls_valid.eq(1)
1513 comb += st_stbs_done.eq(0)
1514 sync += r1.inc_acks.eq(1)
1515 with m.Else():
1516 sync += r1.wb.stb.eq(0)
1517 comb += st_stbs_done.eq(1)
1518
1519 # Got ack ? See if complete.
1520 with m.If(wb_in.ack):
1521 with m.If(st_stbs_done & (adjust_acks == 1)):
1522 sync += r1.state.eq(State.IDLE)
1523 sync += r1.wb.cyc.eq(0)
1524 sync += r1.wb.stb.eq(0)
1525 sync += r1.dec_acks.eq(1)
1526
1527 with m.Case(State.NC_LOAD_WAIT_ACK):
1528 # Clear stb when slave accepted request
1529 with m.If(~wb_in.stall):
1530 sync += r1.wb.stb.eq(0)
1531
1532 # Got ack ? complete.
1533 with m.If(wb_in.ack):
1534 sync += r1.state.eq(State.IDLE)
1535 sync += r1.full.eq(0)
1536 sync += r1.slow_valid.eq(1)
1537
1538 with m.If(~r1.mmu_req):
1539 sync += r1.ls_valid.eq(1)
1540 with m.Else():
1541 sync += r1.mmu_done.eq(1)
1542
1543 sync += r1.forward_sel.eq(~0) # all 1s
1544 sync += r1.use_forward1.eq(1)
1545 sync += r1.wb.cyc.eq(0)
1546 sync += r1.wb.stb.eq(0)
1547
1548 def dcache_log(self, m, r1, valid_ra, tlb_hit_way, stall_out):
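        # note: debug/log output - currently unused (the call at the end
        # of elaborate() is commented out)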
1549
1550 sync = m.d.sync
1551 d_out, wb_in, log_out = self.d_out, self.wb_in, self.log_out
1552
1553 sync += log_out.eq(Cat(r1.state[:3], valid_ra, tlb_hit_way[:3],
1554 stall_out, req_op[:3], d_out.valid, d_out.error,
1555 r1.wb.cyc, r1.wb.stb, wb_in.ack, wb_in.stall,
1556 r1.real_adr[3:6]))
1557
1558 def elaborate(self, platform):
1559
1560 m = Module()
1561 comb = m.d.comb
1562 d_in = self.d_in
1563
1564 # Storage. Hopefully "cache_rows" is a BRAM, the rest is LUTs
1565 cache_tags = CacheTagArray()
1566 cache_tag_set = Signal(TAG_RAM_WIDTH)
1567 cache_valids = CacheValidBitsArray()
1568
1569 # TODO attribute ram_style : string;
1570 # TODO attribute ram_style of cache_tags : signal is "distributed";
1571
1572 """note: these are passed to nmigen.hdl.Memory as "attributes".
1573 don't know how, just that they are.
1574 """
1575 dtlb_valid_bits = TLBValidBitsArray()
1576 dtlb_tags = TLBTagsArray()
1577 dtlb_ptes = TLBPtesArray()
1578 # TODO attribute ram_style of
1579 # dtlb_tags : signal is "distributed";
1580 # TODO attribute ram_style of
1581 # dtlb_ptes : signal is "distributed";
1582
1583 r0 = RegStage0("r0")
1584 r0_full = Signal()
1585
1586 r1 = RegStage1("r1")
1587
1588 reservation = Reservation()
1589
1590 # Async signals on incoming request
1591 req_index = Signal(INDEX_BITS)
1592 req_row = Signal(ROW_BITS)
1593 req_hit_way = Signal(WAY_BITS)
1594 req_tag = Signal(TAG_BITS)
1595 req_op = Signal(Op)
1596 req_data = Signal(64)
1597 req_same_tag = Signal()
1598 req_go = Signal()
1599
1600 early_req_row = Signal(ROW_BITS)
1601
1602 cancel_store = Signal()
1603 set_rsrv = Signal()
1604 clear_rsrv = Signal()
1605
1606 r0_valid = Signal()
1607 r0_stall = Signal()
1608
1609 use_forward1_next = Signal()
1610 use_forward2_next = Signal()
1611
1612 cache_out_row = Signal(WB_DATA_BITS)
1613
1614 plru_victim = PLRUOut()
1615 replace_way = Signal(WAY_BITS)
1616
1617 # Wishbone read/write/cache write formatting signals
1618 bus_sel = Signal(8)
1619
1620 # TLB signals
1621 tlb_tag_way = Signal(TLB_TAG_WAY_BITS)
1622 tlb_pte_way = Signal(TLB_PTE_WAY_BITS)
1623 tlb_valid_way = Signal(TLB_NUM_WAYS)
1624 tlb_req_index = Signal(TLB_SET_BITS)
1625 tlb_hit = Signal()
1626 tlb_hit_way = Signal(TLB_WAY_BITS)
1627 pte = Signal(TLB_PTE_BITS)
1628 ra = Signal(REAL_ADDR_BITS)
1629 valid_ra = Signal()
1630 perm_attr = PermAttr("dc_perms")
1631 rc_ok = Signal()
1632 perm_ok = Signal()
1633 access_ok = Signal()
1634
1635 tlb_plru_victim = TLBPLRUOut()
1636
1637 # we don't yet handle collisions between loadstore1 requests
1638 # and MMU requests
1639 comb += self.m_out.stall.eq(0)
1640
1641 # Hold off the request in r0 when r1 has an uncompleted request
1642 comb += r0_stall.eq(r0_full & (r1.full | d_in.hold))
1643 comb += r0_valid.eq(r0_full & ~r1.full & ~d_in.hold)
1644 comb += self.stall_out.eq(r0_stall)
1645
1646 # Wire up wishbone request latch out of stage 1
1647 comb += self.wb_out.eq(r1.wb)
1648
1649 # deal with litex not doing wishbone pipeline mode
1650 # XXX in wrong way. FIFOs are needed in the SRAM test
1651 # so that stb/ack match up
1652 #comb += self.wb_in.stall.eq(self.wb_out.cyc & ~self.wb_in.ack)
1653
1654 # call sub-functions putting everything together, using shared
1655 # signals established above
1656 self.stage_0(m, r0, r1, r0_full)
1657 self.tlb_read(m, r0_stall, tlb_valid_way,
1658 tlb_tag_way, tlb_pte_way, dtlb_valid_bits,
1659 dtlb_tags, dtlb_ptes)
1660 self.tlb_search(m, tlb_req_index, r0, r0_valid,
1661 tlb_valid_way, tlb_tag_way, tlb_hit_way,
1662 tlb_pte_way, pte, tlb_hit, valid_ra, perm_attr, ra)
1663 self.tlb_update(m, r0_valid, r0, dtlb_valid_bits, tlb_req_index,
1664 tlb_hit_way, tlb_hit, tlb_plru_victim, tlb_tag_way,
1665 dtlb_tags, tlb_pte_way, dtlb_ptes)
1666 self.maybe_plrus(m, r1, plru_victim)
1667 self.maybe_tlb_plrus(m, r1, tlb_plru_victim)
1668 self.cache_tag_read(m, r0_stall, req_index, cache_tag_set, cache_tags)
1669 self.dcache_request(m, r0, ra, req_index, req_row, req_tag,
1670 r0_valid, r1, cache_valids, replace_way,
1671 use_forward1_next, use_forward2_next,
1672 req_hit_way, plru_victim, rc_ok, perm_attr,
1673 valid_ra, perm_ok, access_ok, req_op, req_go,
1674 tlb_pte_way,
1675 tlb_hit, tlb_hit_way, tlb_valid_way, cache_tag_set,
1676 cancel_store, req_same_tag, r0_stall, early_req_row)
1677 self.reservation_comb(m, cancel_store, set_rsrv, clear_rsrv,
1678 r0_valid, r0, reservation)
1679 self.reservation_reg(m, r0_valid, access_ok, set_rsrv, clear_rsrv,
1680 reservation, r0)
1681 self.writeback_control(m, r1, cache_out_row)
1682 self.rams(m, r1, early_req_row, cache_out_row, replace_way)
1683 self.dcache_fast_hit(m, req_op, r0_valid, r0, r1,
1684 req_hit_way, req_index, req_tag, access_ok,
1685 tlb_hit, tlb_hit_way, tlb_req_index)
1686 self.dcache_slow(m, r1, use_forward1_next, use_forward2_next,
1687 cache_valids, r0, replace_way,
1688 req_hit_way, req_same_tag,
1689 r0_valid, req_op, cache_tags, req_go, ra)
1690 #self.dcache_log(m, r1, valid_ra, tlb_hit_way, stall_out)
1691
1692 return m
1693
1694 def dcache_load(dut, addr, nc=0):
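    """Simulation helper: drive a load on d_in, wait for d_out.valid and
    return the data read."""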
1695 yield dut.d_in.load.eq(1)
1696 yield dut.d_in.nc.eq(nc)
1697 yield dut.d_in.addr.eq(addr)
1698 yield dut.d_in.byte_sel.eq(~0)
1699 yield dut.d_in.valid.eq(1)
1700 yield
1701 yield dut.d_in.valid.eq(0)
1702 yield dut.d_in.byte_sel.eq(0)
1703 while not (yield dut.d_out.valid):
1704 yield
1705 # yield # data is valid one cycle AFTER valid goes hi? (no it isn't)
1706 data = yield dut.d_out.data
1707 return data
1708
1709
1710 def dcache_store(dut, addr, data, nc=0):
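    """Simulation helper: drive a store of data to addr on d_in and wait
    for d_out.valid to signal completion."""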
1711 yield dut.d_in.load.eq(0)
1712 yield dut.d_in.nc.eq(nc)
1713 yield dut.d_in.data.eq(data)
1714 yield dut.d_in.byte_sel.eq(~0)
1715 yield dut.d_in.addr.eq(addr)
1716 yield dut.d_in.valid.eq(1)
1717 yield
1718 yield dut.d_in.valid.eq(0)
1719 yield dut.d_in.byte_sel.eq(0)
1720 while not (yield dut.d_out.valid):
1721 yield
1722
1723
1724 def dcache_random_sim(dut, mem):
1725
1726 # start copy of mem
1727 sim_mem = deepcopy(mem)
1728 memsize = len(sim_mem)
1729 print ("mem len", memsize)
1730
1731 # clear stuff
1732 yield dut.d_in.valid.eq(0)
1733 yield dut.d_in.load.eq(0)
1734 yield dut.d_in.priv_mode.eq(1)
1735 yield dut.d_in.nc.eq(0)
1736 yield dut.d_in.addr.eq(0)
1737 yield dut.d_in.data.eq(0)
1738 yield dut.m_in.valid.eq(0)
1739 yield dut.m_in.addr.eq(0)
1740 yield dut.m_in.pte.eq(0)
1741 # wait 4 * clk_period
1742 yield
1743 yield
1744 yield
1745 yield
1746
1747 print ()
1748
1749 #for i in range(1024):
1750 # sim_mem[i] = i
1751
1752 for i in range(1024):
1753 addr = randint(0, memsize-1)
1754 data = randint(0, (1<<64)-1)
1755 sim_mem[addr] = data
1756 row = addr
1757 addr *= 8
1758
1759 print ("random testing %d 0x%x row %d data 0x%x" % (i, addr, row, data))
1760
1761 yield from dcache_load(dut, addr)
1762 yield from dcache_store(dut, addr, data)
1763
1764 addr = randint(0, memsize-1)
1765 sim_data = sim_mem[addr]
1766 row = addr
1767 addr *= 8
1768
1769 print (" load 0x%x row %d expect data 0x%x" % (addr, row, sim_data))
1770 data = yield from dcache_load(dut, addr)
1771 assert data == sim_data, \
1772 "check addr 0x%x row %d data %x != %x" % (addr, row, data, sim_data)
1773
1774 for addr in range(memsize):
1775 data = yield from dcache_load(dut, addr*8)
1776 assert data == sim_mem[addr], \
1777 "final check %x data %x != %x" % (addr*8, data, sim_mem[addr])
1778
1779 def dcache_regression_sim(dut, mem):
1780
1781 # start copy of mem
1782 sim_mem = deepcopy(mem)
1783 memsize = len(sim_mem)
1784 print ("mem len", memsize)
1785
1786 # clear stuff
1787 yield dut.d_in.valid.eq(0)
1788 yield dut.d_in.load.eq(0)
1789 yield dut.d_in.priv_mode.eq(1)
1790 yield dut.d_in.nc.eq(0)
1791 yield dut.d_in.addr.eq(0)
1792 yield dut.d_in.data.eq(0)
1793 yield dut.m_in.valid.eq(0)
1794 yield dut.m_in.addr.eq(0)
1795 yield dut.m_in.pte.eq(0)
1796 # wait 4 * clk_period
1797 yield
1798 yield
1799 yield
1800 yield
1801
1802 addr = 0
1803 row = addr
1804 addr *= 8
1805
1806     print ("regression testing 0x%x row %d" % (addr, row))
1807
1808 yield from dcache_load(dut, addr)
1809
1810 addr = 2
1811 sim_data = sim_mem[addr]
1812 row = addr
1813 addr *= 8
1814
1815 print (" load 0x%x row %d expect data 0x%x" % (addr, row, sim_data))
1816 data = yield from dcache_load(dut, addr)
1817 assert data == sim_data, \
1818 "check addr 0x%x row %d data %x != %x" % (addr, row, data, sim_data)
1819
1820
1821
1822 def dcache_sim(dut, mem):
1823 # clear stuff
1824 yield dut.d_in.valid.eq(0)
1825 yield dut.d_in.load.eq(0)
1826 yield dut.d_in.priv_mode.eq(1)
1827 yield dut.d_in.nc.eq(0)
1828 yield dut.d_in.addr.eq(0)
1829 yield dut.d_in.data.eq(0)
1830 yield dut.m_in.valid.eq(0)
1831 yield dut.m_in.addr.eq(0)
1832 yield dut.m_in.pte.eq(0)
1833 # wait 4 * clk_period
1834 yield
1835 yield
1836 yield
1837 yield
1838
1839     # Cacheable read of address 0x58
1840     data = yield from dcache_load(dut, 0x58)
1841     addr = yield dut.d_in.addr
1842     assert data == 0x0000001700000016, \
1843         "data @%x=%x expected 0x0000001700000016" % (addr, data)
1844
1845 # Cacheable read of address 20
1846 data = yield from dcache_load(dut, 0x20)
1847 addr = yield dut.d_in.addr
1848 assert data == 0x0000000900000008, \
1849         "data @%x=%x expected 0x0000000900000008" % (addr, data)
1850
1851     # Cacheable read of address 0x530
1852     data = yield from dcache_load(dut, 0x530)
1853     addr = yield dut.d_in.addr
1854     assert data == 0x0000014D0000014C, \
1855         "data @%x=%x expected 0x0000014D0000014C" % (addr, data)
1856
1857     # 2nd Cacheable read of address 0x530
1858     data = yield from dcache_load(dut, 0x530)
1859     addr = yield dut.d_in.addr
1860     assert data == 0x0000014D0000014C, \
1861         "data @%x=%x expected 0x0000014D0000014C" % (addr, data)
1862
1863     # Non-cacheable read of address 0x100
1864     data = yield from dcache_load(dut, 0x100, nc=1)
1865     addr = yield dut.d_in.addr
1866     assert data == 0x0000004100000040, \
1867         "data @%x=%x expected 0x0000004100000040" % (addr, data)
1868
1869     # Store at address 0x530
1870     yield from dcache_store(dut, 0x530, 0x121)
1871 
1872     # Second store at address 0x530, overwriting the previous value
1873     yield from dcache_store(dut, 0x530, 0x12345678)
1874
1875     # 3rd Cacheable read of address 0x530
1876     data = yield from dcache_load(dut, 0x530)
1877     addr = yield dut.d_in.addr
1878     assert data == 0x12345678, \
1879         "data @%x=%x expected 0x12345678" % (addr, data)
1880
1881     # 4th Cacheable read of address 0x20
1882     data = yield from dcache_load(dut, 0x20)
1883     addr = yield dut.d_in.addr
1884     assert data == 0x0000000900000008, \
1885         "data @%x=%x expected 0x0000000900000008" % (addr, data)
1886
1887 yield
1888 yield
1889 yield
1890 yield
1891
1892
1893 def test_dcache(mem, test_fn, test_name):
1894 dut = DCache()
1895
1896 memory = Memory(width=64, depth=len(mem), init=mem, simulate=True)
1897 sram = SRAM(memory=memory, granularity=8)
1898
1899 m = Module()
1900 m.submodules.dcache = dut
1901 m.submodules.sram = sram
1902
1903 m.d.comb += sram.bus.cyc.eq(dut.wb_out.cyc)
1904 m.d.comb += sram.bus.stb.eq(dut.wb_out.stb)
1905 m.d.comb += sram.bus.we.eq(dut.wb_out.we)
1906 m.d.comb += sram.bus.sel.eq(dut.wb_out.sel)
1907 m.d.comb += sram.bus.adr.eq(dut.wb_out.adr)
1908 m.d.comb += sram.bus.dat_w.eq(dut.wb_out.dat)
1909
1910 m.d.comb += dut.wb_in.ack.eq(sram.bus.ack)
1911 m.d.comb += dut.wb_in.dat.eq(sram.bus.dat_r)
1912
1913 dcache_write_gtkw(test_name)
1914
1915 # nmigen Simulation
1916 sim = Simulator(m)
1917 sim.add_clock(1e-6)
1918
1919 sim.add_sync_process(wrap(test_fn(dut, mem)))
1920 with sim.write_vcd('test_dcache%s.vcd' % test_name):
1921 sim.run()
1922
1923
1924 def dcache_write_gtkw(test_name):
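    """Write a GTKWave save file (.gtkw) for this test's VCD, pre-selecting
    the request, response and wishbone traces listed below."""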
1925 traces = [
1926 'clk',
1927 ('d_in', [
1928 'd_in_load', 'd_in_nc', 'd_in_addr[63:0]', 'd_in_data[63:0]',
1929 'd_in_byte_sel[7:0]', 'd_in_valid'
1930 ]),
1931 ('d_out', [
1932 'd_out_valid', 'd_out_data[63:0]'
1933 ]),
1934 ('wb_out', [
1935 'wb_out_cyc', 'wb_out_stb', 'wb_out_we',
1936 'wb_out_adr[31:0]', 'wb_out_sel[7:0]', 'wb_out_dat[63:0]'
1937 ]),
1938 ('wb_in', [
1939 'wb_in_stall', 'wb_in_ack', 'wb_in_dat[63:0]'
1940 ])
1941 ]
1942 write_gtkw('test_dcache%s.gtkw' % test_name,
1943 'test_dcache%s.vcd' % test_name,
1944 traces, module='top.dcache')
1945
1946
1947 if __name__ == '__main__':
1948 seed(0)
1949 dut = DCache()
1950 vl = rtlil.convert(dut, ports=[])
1951 with open("test_dcache.il", "w") as f:
1952 f.write(vl)
1953
1954 mem = []
1955 memsize = 16
1956 for i in range(memsize):
1957 mem.append(i)
1958
1959 test_dcache(mem, dcache_regression_sim, "simpleregression")
1960
1961 mem = []
1962 memsize = 256
1963 for i in range(memsize):
1964 mem.append(i)
1965
1966 test_dcache(mem, dcache_random_sim, "random")
1967
1968 mem = []
1969 for i in range(1024):
1970 mem.append((i*2)| ((i*2+1)<<32))
1971
1972 test_dcache(mem, dcache_sim, "")
1973