src/soc/experiment/dcache.py

   1 """DCache
   2
   3 based on Anton Blanchard microwatt dcache.vhdl
   4
   5 note that the microwatt dcache wishbone interface expects "stall".
   6 for simplicity at the moment this is hard-coded to cyc & ~ack.
   7 see WB4 spec, p84, section 5.2.1
   8 """
   9
  10 import sys
  11 sys.setrecursionlimit(1000000)
  12
  13 from enum import Enum, unique
  14
  15 from nmigen import Module, Signal, Elaboratable, Cat, Repl, Array, Const
  16 from nmutil.util import Display
  17
  18 from copy import deepcopy
  19 from random import randint, seed
  20
  21 from nmigen.cli import main
  22 from nmutil.iocontrol import RecordObject
  23 from nmigen.utils import log2_int
  24 from soc.experiment.mem_types import (LoadStore1ToDCacheType,
  25                                      DCacheToLoadStore1Type,
  26                                      MMUToDCacheType,
  27                                      DCacheToMMUType)
  28
  29 from soc.experiment.wb_types import (WB_ADDR_BITS, WB_DATA_BITS, WB_SEL_BITS,
  30                                 WBAddrType, WBDataType, WBSelType,
  31                                 WBMasterOut, WBSlaveOut,
  32                                 WBMasterOutVector, WBSlaveOutVector,
  33                                 WBIOMasterOut, WBIOSlaveOut)
  34
  35 from soc.experiment.cache_ram import CacheRam
  36 #from soc.experiment.plru import PLRU
  37 from nmutil.plru import PLRU
  38
  39 # for test
  40 from soc.bus.sram import SRAM
  41 from nmigen import Memory
  42 from nmigen.cli import rtlil
  43
  44 # NOTE: to use cxxsim, export NMIGEN_SIM_MODE=cxxsim from the shell
  45 # Also, check out the cxxsim nmigen branch, and latest yosys from git
  46 from nmutil.sim_tmp_alternative import Simulator
  47
  48 from nmutil.util import wrap
  49
  50
  51 # TODO: make these parameters of DCache at some point
  52 LINE_SIZE = 64    # Line size in bytes
  53 NUM_LINES = 16    # Number of lines in a set
  54 NUM_WAYS = 4      # Number of ways
  55 TLB_SET_SIZE = 64 # L1 DTLB entries per set
  56 TLB_NUM_WAYS = 2  # L1 DTLB number of sets
  57 TLB_LG_PGSZ = 12  # L1 DTLB log_2(page_size)
  58 LOG_LENGTH = 0    # Non-zero to enable log data collection
  59
  60 # BRAM organisation: We never access more than
  61 #     -- WB_DATA_BITS at a time so to save
  62 #     -- resources we make the array only that wide, and
  63 #     -- use consecutive indices for to make a cache "line"
  64 #     --
  65 #     -- ROW_SIZE is the width in bytes of the BRAM
  66 #     -- (based on WB, so 64-bits)
  67 ROW_SIZE = WB_DATA_BITS // 8;
  68
  69 # ROW_PER_LINE is the number of row (wishbone
  70 # transactions) in a line
  71 ROW_PER_LINE = LINE_SIZE // ROW_SIZE
  72
  73 # BRAM_ROWS is the number of rows in BRAM needed
  74 # to represent the full dcache
  75 BRAM_ROWS = NUM_LINES * ROW_PER_LINE
  76
  77 print ("ROW_SIZE", ROW_SIZE)
  78 print ("ROW_PER_LINE", ROW_PER_LINE)
  79 print ("BRAM_ROWS", BRAM_ROWS)
  80 print ("NUM_WAYS", NUM_WAYS)
  81
  82 # Bit fields counts in the address
  83
  84 # REAL_ADDR_BITS is the number of real address
  85 # bits that we store
  86 REAL_ADDR_BITS = 56
  87
  88 # ROW_BITS is the number of bits to select a row
  89 ROW_BITS = log2_int(BRAM_ROWS)
  90
  91 # ROW_LINE_BITS is the number of bits to select
  92 # a row within a line
  93 ROW_LINE_BITS = log2_int(ROW_PER_LINE)
  94
  95 # LINE_OFF_BITS is the number of bits for
  96 # the offset in a cache line
  97 LINE_OFF_BITS = log2_int(LINE_SIZE)
  98
  99 # ROW_OFF_BITS is the number of bits for
 100 # the offset in a row
 101 ROW_OFF_BITS = log2_int(ROW_SIZE)
 102
 103 # INDEX_BITS is the number if bits to
 104 # select a cache line
 105 INDEX_BITS = log2_int(NUM_LINES)
 106
 107 # SET_SIZE_BITS is the log base 2 of the set size
 108 SET_SIZE_BITS = LINE_OFF_BITS + INDEX_BITS
 109
 110 # TAG_BITS is the number of bits of
 111 # the tag part of the address
 112 TAG_BITS = REAL_ADDR_BITS - SET_SIZE_BITS
 113
 114 # TAG_WIDTH is the width in bits of each way of the tag RAM
 115 TAG_WIDTH = TAG_BITS + 7 - ((TAG_BITS + 7) % 8)
 116
 117 # WAY_BITS is the number of bits to select a way
 118 WAY_BITS = log2_int(NUM_WAYS)
 119
 120 # Example of layout for 32 lines of 64 bytes:
 121 layout = """\
 122   ..  tag    |index|  line  |
 123   ..         |   row   |    |
 124   ..         |     |---|    | ROW_LINE_BITS  (3)
 125   ..         |     |--- - --| LINE_OFF_BITS (6)
 126   ..         |         |- --| ROW_OFF_BITS  (3)
 127   ..         |----- ---|    | ROW_BITS      (8)
 128   ..         |-----|        | INDEX_BITS    (5)
 129   .. --------|              | TAG_BITS      (45)
 130 """
 131 print (layout)
 132 print ("Dcache TAG %d IDX %d ROW_BITS %d ROFF %d LOFF %d RLB %d" % \
 133             (TAG_BITS, INDEX_BITS, ROW_BITS,
 134              ROW_OFF_BITS, LINE_OFF_BITS, ROW_LINE_BITS))
 135 print ("index @: %d-%d" % (LINE_OFF_BITS, SET_SIZE_BITS))
 136 print ("row @: %d-%d" % (LINE_OFF_BITS, ROW_OFF_BITS))
 137 print ("tag @: %d-%d width %d" % (SET_SIZE_BITS, REAL_ADDR_BITS, TAG_WIDTH))
 138
 139 TAG_RAM_WIDTH = TAG_WIDTH * NUM_WAYS
 140
 141 print ("TAG_RAM_WIDTH", TAG_RAM_WIDTH)
 142
 143 def CacheTagArray():
 144     return Array(Signal(TAG_RAM_WIDTH, name="cachetag_%d" % x) \
 145                         for x in range(NUM_LINES))
 146
 147 def CacheValidBitsArray():
 148     return Array(Signal(NUM_WAYS, name="cachevalid_%d" % x) \
 149                         for x in range(NUM_LINES))
 150
 151 def RowPerLineValidArray():
 152     return Array(Signal(name="rows_valid%d" % x) \
 153                         for x in range(ROW_PER_LINE))
 154
 155 # L1 TLB
 156 TLB_SET_BITS     = log2_int(TLB_SET_SIZE)
 157 TLB_WAY_BITS     = log2_int(TLB_NUM_WAYS)
 158 TLB_EA_TAG_BITS  = 64 - (TLB_LG_PGSZ + TLB_SET_BITS)
 159 TLB_TAG_WAY_BITS = TLB_NUM_WAYS * TLB_EA_TAG_BITS
 160 TLB_PTE_BITS     = 64
 161 TLB_PTE_WAY_BITS = TLB_NUM_WAYS * TLB_PTE_BITS;
 162
 163 def ispow2(x):
 164     return (1<<log2_int(x, False)) == x
 165
 166 assert (LINE_SIZE % ROW_SIZE) == 0, "LINE_SIZE not multiple of ROW_SIZE"
 167 assert ispow2(LINE_SIZE), "LINE_SIZE not power of 2"
 168 assert ispow2(NUM_LINES), "NUM_LINES not power of 2"
 169 assert ispow2(ROW_PER_LINE), "ROW_PER_LINE not power of 2"
 170 assert ROW_BITS == (INDEX_BITS + ROW_LINE_BITS), "geometry bits don't add up"
 171 assert (LINE_OFF_BITS == ROW_OFF_BITS + ROW_LINE_BITS), \
 172         "geometry bits don't add up"
 173 assert REAL_ADDR_BITS == (TAG_BITS + INDEX_BITS + LINE_OFF_BITS), \
 174         "geometry bits don't add up"
 175 assert REAL_ADDR_BITS == (TAG_BITS + ROW_BITS + ROW_OFF_BITS), \
 176          "geometry bits don't add up"
 177 assert 64 == WB_DATA_BITS, "Can't yet handle wb width that isn't 64-bits"
 178 assert SET_SIZE_BITS <= TLB_LG_PGSZ, "Set indexed by virtual address"
 179
 180
 181 def TLBValidBitsArray():
 182     return Array(Signal(TLB_NUM_WAYS, name="tlbvalid%d" % x) \
 183                 for x in range(TLB_SET_SIZE))
 184
 185 def TLBTagEAArray():
 186     return Array(Signal(TLB_EA_TAG_BITS, name="tlbtagea%d" % x) \
 187                 for x in range (TLB_NUM_WAYS))
 188
 189 def TLBTagsArray():
 190     return Array(Signal(TLB_TAG_WAY_BITS, name="tlbtags%d" % x) \
 191                 for x in range (TLB_SET_SIZE))
 192
 193 def TLBPtesArray():
 194     return Array(Signal(TLB_PTE_WAY_BITS, name="tlbptes%d" % x) \
 195                 for x in range(TLB_SET_SIZE))
 196
 197 def HitWaySet():
 198     return Array(Signal(WAY_BITS, name="hitway_%d" % x) \
 199                         for x in range(TLB_NUM_WAYS))
 200
 201 # Cache RAM interface
 202 def CacheRamOut():
 203     return Array(Signal(WB_DATA_BITS, name="cache_out%d" % x) \
 204                  for x in range(NUM_WAYS))
 205
 206 # PLRU output interface
 207 def PLRUOut():
 208     return Array(Signal(WAY_BITS, name="plru_out%d" % x) \
 209                 for x in range(NUM_LINES))
 210
 211 # TLB PLRU output interface
 212 def TLBPLRUOut():
 213     return Array(Signal(TLB_WAY_BITS, name="tlbplru_out%d" % x) \
 214                 for x in range(TLB_SET_SIZE))
 215
 216 # Helper functions to decode incoming requests
 217 #
 218 # Return the cache line index (tag index) for an address
 219 def get_index(addr):
 220     return addr[LINE_OFF_BITS:SET_SIZE_BITS]
 221
 222 # Return the cache row index (data memory) for an address
 223 def get_row(addr):
 224     return addr[ROW_OFF_BITS:SET_SIZE_BITS]
 225
 226 # Return the index of a row within a line
 227 def get_row_of_line(row):
 228     return row[:ROW_BITS][:ROW_LINE_BITS]
 229
 230 # Returns whether this is the last row of a line
 231 def is_last_row_addr(addr, last):
 232     return addr[ROW_OFF_BITS:LINE_OFF_BITS] == last
 233
 234 # Returns whether this is the last row of a line
 235 def is_last_row(row, last):
 236     return get_row_of_line(row) == last
 237
 238 # Return the next row in the current cache line. We use a
 239 # dedicated function in order to limit the size of the
 240 # generated adder to be only the bits within a cache line
 241 # (3 bits with default settings)
 242 def next_row(row):
 243     row_v = row[0:ROW_LINE_BITS] + 1
 244     return Cat(row_v[:ROW_LINE_BITS], row[ROW_LINE_BITS:])
 245
 246 # Get the tag value from the address
 247 def get_tag(addr):
 248     return addr[SET_SIZE_BITS:REAL_ADDR_BITS]
 249
 250 # Read a tag from a tag memory row
 251 def read_tag(way, tagset):
 252     return tagset.word_select(way, TAG_WIDTH)[:TAG_BITS]
 253
 254 # Read a TLB tag from a TLB tag memory row
 255 def read_tlb_tag(way, tags):
 256     return tags.word_select(way, TLB_EA_TAG_BITS)
 257
 258 # Write a TLB tag to a TLB tag memory row
 259 def write_tlb_tag(way, tags, tag):
 260     return read_tlb_tag(way, tags).eq(tag)
 261
 262 # Read a PTE from a TLB PTE memory row
 263 def read_tlb_pte(way, ptes):
 264     return ptes.word_select(way, TLB_PTE_BITS)
 265
 266 def write_tlb_pte(way, ptes, newpte):
 267     return read_tlb_pte(way, ptes).eq(newpte)
 268
 269
 270 # Record for storing permission, attribute, etc. bits from a PTE
 271 class PermAttr(RecordObject):
 272     def __init__(self, name=None):
 273         super().__init__(name=name)
 274         self.reference = Signal()
 275         self.changed   = Signal()
 276         self.nocache   = Signal()
 277         self.priv      = Signal()
 278         self.rd_perm   = Signal()
 279         self.wr_perm   = Signal()
 280
 281
 282 def extract_perm_attr(pte):
 283     pa = PermAttr()
 284     return pa;
 285
 286
 287 # Type of operation on a "valid" input
 288 @unique
 289 class Op(Enum):
 290     OP_NONE       = 0
 291     OP_BAD        = 1 # NC cache hit, TLB miss, prot/RC failure
 292     OP_STCX_FAIL  = 2 # conditional store w/o reservation
 293     OP_LOAD_HIT   = 3 # Cache hit on load
 294     OP_LOAD_MISS  = 4 # Load missing cache
 295     OP_LOAD_NC    = 5 # Non-cachable load
 296     OP_STORE_HIT  = 6 # Store hitting cache
 297     OP_STORE_MISS = 7 # Store missing cache
 298
 299
 300 # Cache state machine
 301 @unique
 302 class State(Enum):
 303     IDLE             = 0 # Normal load hit processing
 304     RELOAD_WAIT_ACK  = 1 # Cache reload wait ack
 305     STORE_WAIT_ACK   = 2 # Store wait ack
 306     NC_LOAD_WAIT_ACK = 3 # Non-cachable load wait ack
 307
 308
 309 # Dcache operations:
 310 #
 311 # In order to make timing, we use the BRAMs with
 312 # an output buffer, which means that the BRAM
 313 # output is delayed by an extra cycle.
 314 #
 315 # Thus, the dcache has a 2-stage internal pipeline
 316 # for cache hits with no stalls.
 317 #
 318 # All other operations are handled via stalling
 319 # in the first stage.
 320 #
 321 # The second stage can thus complete a hit at the same
 322 # time as the first stage emits a stall for a complex op.
 323 #
 324 # Stage 0 register, basically contains just the latched request
 325
 326 class RegStage0(RecordObject):
 327     def __init__(self, name=None):
 328         super().__init__(name=name)
 329         self.req     = LoadStore1ToDCacheType(name="lsmem")
 330         self.tlbie   = Signal() # indicates a tlbie request (from MMU)
 331         self.doall   = Signal() # with tlbie, indicates flush whole TLB
 332         self.tlbld   = Signal() # indicates a TLB load request (from MMU)
 333         self.mmu_req = Signal() # indicates source of request
 334         self.d_valid = Signal() # indicates req.data is valid now
 335
 336
 337 class MemAccessRequest(RecordObject):
 338     def __init__(self, name=None):
 339         super().__init__(name=name)
 340         self.op        = Signal(Op)
 341         self.valid     = Signal()
 342         self.dcbz      = Signal()
 343         self.real_addr = Signal(REAL_ADDR_BITS)
 344         self.data      = Signal(64)
 345         self.byte_sel  = Signal(8)
 346         self.hit_way   = Signal(WAY_BITS)
 347         self.same_tag  = Signal()
 348         self.mmu_req   = Signal()
 349
 350
 351 # First stage register, contains state for stage 1 of load hits
 352 # and for the state machine used by all other operations
 353 class RegStage1(RecordObject):
 354     def __init__(self, name=None):
 355         super().__init__(name=name)
 356         # Info about the request
 357         self.full             = Signal() # have uncompleted request
 358         self.mmu_req          = Signal() # request is from MMU
 359         self.req              = MemAccessRequest(name="reqmem")
 360
 361         # Cache hit state
 362         self.hit_way          = Signal(WAY_BITS)
 363         self.hit_load_valid   = Signal()
 364         self.hit_index        = Signal(INDEX_BITS)
 365         self.cache_hit        = Signal()
 366
 367         # TLB hit state
 368         self.tlb_hit          = Signal()
 369         self.tlb_hit_way      = Signal(TLB_NUM_WAYS)
 370         self.tlb_hit_index    = Signal(TLB_WAY_BITS)
 371
 372         # 2-stage data buffer for data forwarded from writes to reads
 373         self.forward_data1    = Signal(64)
 374         self.forward_data2    = Signal(64)
 375         self.forward_sel1     = Signal(8)
 376         self.forward_valid1   = Signal()
 377         self.forward_way1     = Signal(WAY_BITS)
 378         self.forward_row1     = Signal(ROW_BITS)
 379         self.use_forward1     = Signal()
 380         self.forward_sel      = Signal(8)
 381
 382         # Cache miss state (reload state machine)
 383         self.state            = Signal(State)
 384         self.dcbz             = Signal()
 385         self.write_bram       = Signal()
 386         self.write_tag        = Signal()
 387         self.slow_valid       = Signal()
 388         self.wb               = WBMasterOut("wb")
 389         self.reload_tag       = Signal(TAG_BITS)
 390         self.store_way        = Signal(WAY_BITS)
 391         self.store_row        = Signal(ROW_BITS)
 392         self.store_index      = Signal(INDEX_BITS)
 393         self.end_row_ix       = Signal(ROW_LINE_BITS)
 394         self.rows_valid       = RowPerLineValidArray()
 395         self.acks_pending     = Signal(3)
 396         self.inc_acks         = Signal()
 397         self.dec_acks         = Signal()
 398
 399         # Signals to complete (possibly with error)
 400         self.ls_valid         = Signal()
 401         self.ls_error         = Signal()
 402         self.mmu_done         = Signal()
 403         self.mmu_error        = Signal()
 404         self.cache_paradox    = Signal()
 405
 406         # Signal to complete a failed stcx.
 407         self.stcx_fail        = Signal()
 408
 409
 410 # Reservation information
 411 class Reservation(RecordObject):
 412     def __init__(self):
 413         super().__init__()
 414         self.valid = Signal()
 415         self.addr  = Signal(64-LINE_OFF_BITS)
 416
 417
 418 class DTLBUpdate(Elaboratable):
 419     def __init__(self):
 420         self.tlbie    = Signal()
 421         self.tlbwe    = Signal()
 422         self.doall    = Signal()
 423         self.updated  = Signal()
 424         self.v_updated  = Signal()
 425         self.tlb_hit    = Signal()
 426         self.tlb_req_index = Signal(TLB_SET_BITS)
 427
 428         self.tlb_hit_way     = Signal(TLB_WAY_BITS)
 429         self.tlb_tag_way     = Signal(TLB_TAG_WAY_BITS)
 430         self.tlb_pte_way     = Signal(TLB_PTE_WAY_BITS)
 431         self.repl_way        = Signal(TLB_WAY_BITS)
 432         self.eatag           = Signal(TLB_EA_TAG_BITS)
 433         self.pte_data        = Signal(TLB_PTE_BITS)
 434
 435         self.dv = Signal(TLB_NUM_WAYS) # tlb_way_valids_t
 436
 437         self.tb_out = Signal(TLB_TAG_WAY_BITS) # tlb_way_tags_t
 438         self.pb_out = Signal(TLB_NUM_WAYS)     # tlb_way_valids_t
 439         self.db_out = Signal(TLB_PTE_WAY_BITS) # tlb_way_ptes_t
 440
 441     def elaborate(self, platform):
 442         m = Module()
 443         comb = m.d.comb
 444         sync = m.d.sync
 445
 446         tagset   = Signal(TLB_TAG_WAY_BITS)
 447         pteset   = Signal(TLB_PTE_WAY_BITS)
 448
 449         tb_out, pb_out, db_out = self.tb_out, self.pb_out, self.db_out
 450         comb += db_out.eq(self.dv)
 451
 452         with m.If(self.tlbie & self.doall):
 453             pass # clear all back in parent
 454         with m.Elif(self.tlbie):
 455             with m.If(self.tlb_hit):
 456                 comb += db_out.bit_select(self.tlb_hit_way, 1).eq(1)
 457                 comb += self.v_updated.eq(1)
 458
 459         with m.Elif(self.tlbwe):
 460
 461             comb += tagset.eq(self.tlb_tag_way)
 462             comb += write_tlb_tag(self.repl_way, tagset, self.eatag)
 463             comb += tb_out.eq(tagset)
 464
 465             comb += pteset.eq(self.tlb_pte_way)
 466             comb += write_tlb_pte(self.repl_way, pteset, self.pte_data)
 467             comb += pb_out.eq(pteset)
 468
 469             comb += db_out.bit_select(self.repl_way, 1).eq(1)
 470
 471             comb += self.updated.eq(1)
 472             comb += self.v_updated.eq(1)
 473
 474         return m
 475
 476
 477 class DCachePendingHit(Elaboratable):
 478
 479     def __init__(self, tlb_pte_way, tlb_valid_way, tlb_hit_way,
 480                       cache_valid_idx, cache_tag_set,
 481                     req_addr,
 482                     hit_set):
 483
 484         self.go          = Signal()
 485         self.virt_mode   = Signal()
 486         self.is_hit      = Signal()
 487         self.tlb_hit     = Signal()
 488         self.hit_way     = Signal(WAY_BITS)
 489         self.rel_match   = Signal()
 490         self.req_index   = Signal(INDEX_BITS)
 491         self.reload_tag  = Signal(TAG_BITS)
 492
 493         self.tlb_hit_way = tlb_hit_way
 494         self.tlb_pte_way = tlb_pte_way
 495         self.tlb_valid_way = tlb_valid_way
 496         self.cache_valid_idx = cache_valid_idx
 497         self.cache_tag_set = cache_tag_set
 498         self.req_addr = req_addr
 499         self.hit_set = hit_set
 500
 501     def elaborate(self, platform):
 502         m = Module()
 503         comb = m.d.comb
 504         sync = m.d.sync
 505
 506         go = self.go
 507         virt_mode = self.virt_mode
 508         is_hit = self.is_hit
 509         tlb_pte_way = self.tlb_pte_way
 510         tlb_valid_way = self.tlb_valid_way
 511         cache_valid_idx = self.cache_valid_idx
 512         cache_tag_set = self.cache_tag_set
 513         req_addr = self.req_addr
 514         tlb_hit_way = self.tlb_hit_way
 515         tlb_hit = self.tlb_hit
 516         hit_set = self.hit_set
 517         hit_way = self.hit_way
 518         rel_match = self.rel_match
 519         req_index = self.req_index
 520         reload_tag = self.reload_tag
 521
 522         rel_matches = Array(Signal(name="rel_matches_%d" % i) \
 523                                     for i in range(TLB_NUM_WAYS))
 524         hit_way_set = HitWaySet()
 525
 526         # Test if pending request is a hit on any way
 527         # In order to make timing in virtual mode,
 528         # when we are using the TLB, we compare each
 529         # way with each of the real addresses from each way of
 530         # the TLB, and then decide later which match to use.
 531
 532         with m.If(virt_mode):
 533             for j in range(TLB_NUM_WAYS): # tlb_num_way_t
 534                 s_tag       = Signal(TAG_BITS, name="s_tag%d" % j)
 535                 s_hit       = Signal()
 536                 s_pte       = Signal(TLB_PTE_BITS)
 537                 s_ra        = Signal(REAL_ADDR_BITS)
 538                 comb += s_pte.eq(read_tlb_pte(j, tlb_pte_way))
 539                 comb += s_ra.eq(Cat(req_addr[0:TLB_LG_PGSZ],
 540                                     s_pte[TLB_LG_PGSZ:REAL_ADDR_BITS]))
 541                 comb += s_tag.eq(get_tag(s_ra))
 542
 543                 for i in range(NUM_WAYS): # way_t
 544                     is_tag_hit = Signal(name="is_tag_hit_%d_%d" % (j, i))
 545                     comb += is_tag_hit.eq(go & cache_valid_idx[i] &
 546                                   (read_tag(i, cache_tag_set) == s_tag)
 547                                   & tlb_valid_way[j])
 548                     with m.If(is_tag_hit):
 549                         comb += hit_way_set[j].eq(i)
 550                         comb += s_hit.eq(1)
 551                 comb += hit_set[j].eq(s_hit)
 552                 with m.If(s_tag == reload_tag):
 553                     comb += rel_matches[j].eq(1)
 554             with m.If(tlb_hit):
 555                 comb += is_hit.eq(hit_set[tlb_hit_way])
 556                 comb += hit_way.eq(hit_way_set[tlb_hit_way])
 557                 comb += rel_match.eq(rel_matches[tlb_hit_way])
 558         with m.Else():
 559             s_tag       = Signal(TAG_BITS)
 560             comb += s_tag.eq(get_tag(req_addr))
 561             for i in range(NUM_WAYS): # way_t
 562                 is_tag_hit = Signal(name="is_tag_hit_%d" % i)
 563                 comb += is_tag_hit.eq(go & cache_valid_idx[i] &
 564                           (read_tag(i, cache_tag_set) == s_tag))
 565                 with m.If(is_tag_hit):
 566                     comb += hit_way.eq(i)
 567                     comb += is_hit.eq(1)
 568             with m.If(s_tag == reload_tag):
 569                 comb += rel_match.eq(1)
 570
 571         return m
 572
 573
 574 class DCache(Elaboratable):
 575     """Set associative dcache write-through
 576     TODO (in no specific order):
 577     * See list in icache.vhdl
 578     * Complete load misses on the cycle when WB data comes instead of
 579       at the end of line (this requires dealing with requests coming in
 580       while not idle...)
 581     """
 582     def __init__(self):
 583         self.d_in      = LoadStore1ToDCacheType("d_in")
 584         self.d_out     = DCacheToLoadStore1Type("d_out")
 585
 586         self.m_in      = MMUToDCacheType("m_in")
 587         self.m_out     = DCacheToMMUType("m_out")
 588
 589         self.stall_out = Signal()
 590
 591         self.wb_out    = WBMasterOut()
 592         self.wb_in     = WBSlaveOut()
 593
 594         self.log_out   = Signal(20)
 595
 596     def stage_0(self, m, r0, r1, r0_full):
 597         """Latch the request in r0.req as long as we're not stalling
 598         """
 599         comb = m.d.comb
 600         sync = m.d.sync
 601         d_in, d_out, m_in = self.d_in, self.d_out, self.m_in
 602
 603         r = RegStage0("stage0")
 604
 605         # TODO, this goes in unit tests and formal proofs
 606         with m.If(d_in.valid & m_in.valid):
 607             sync += Display("request collision loadstore vs MMU")
 608
 609         with m.If(m_in.valid):
 610             comb += r.req.valid.eq(1)
 611             comb += r.req.load.eq(~(m_in.tlbie | m_in.tlbld))
 612             comb += r.req.dcbz.eq(0)
 613             comb += r.req.nc.eq(0)
 614             comb += r.req.reserve.eq(0)
 615             comb += r.req.virt_mode.eq(0)
 616             comb += r.req.priv_mode.eq(1)
 617             comb += r.req.addr.eq(m_in.addr)
 618             comb += r.req.data.eq(m_in.pte)
 619             comb += r.req.byte_sel.eq(~0) # Const -1 sets all to 0b111....
 620             comb += r.tlbie.eq(m_in.tlbie)
 621             comb += r.doall.eq(m_in.doall)
 622             comb += r.tlbld.eq(m_in.tlbld)
 623             comb += r.mmu_req.eq(1)
 624         with m.Else():
 625             comb += r.req.eq(d_in)
 626             comb += r.req.data.eq(0)
 627             comb += r.tlbie.eq(0)
 628             comb += r.doall.eq(0)
 629             comb += r.tlbld.eq(0)
 630             comb += r.mmu_req.eq(0)
 631         with m.If((~r1.full & ~d_in.hold) | ~r0_full):
 632             sync += r0.eq(r)
 633             sync += r0_full.eq(r.req.valid)
 634             # Sample data the cycle after a request comes in from loadstore1.
 635             # If another request has come in already then the data will get
 636             # put directly into req.data below.
 637             with m.If(r0.req.valid & ~r.req.valid & ~r0.d_valid &
 638                      ~r0.mmu_req):
 639                 sync += r0.req.data.eq(d_in.data)
 640                 sync += r0.d_valid.eq(1)
 641
 642     def tlb_read(self, m, r0_stall, tlb_valid_way,
 643                  tlb_tag_way, tlb_pte_way, dtlb_valid_bits,
 644                  dtlb_tags, dtlb_ptes):
 645         """TLB
 646         Operates in the second cycle on the request latched in r0.req.
 647         TLB updates write the entry at the end of the second cycle.
 648         """
 649         comb = m.d.comb
 650         sync = m.d.sync
 651         m_in, d_in = self.m_in, self.d_in
 652
 653         index    = Signal(TLB_SET_BITS)
 654         addrbits = Signal(TLB_SET_BITS)
 655
 656         amin = TLB_LG_PGSZ
 657         amax = TLB_LG_PGSZ + TLB_SET_BITS
 658
 659         with m.If(m_in.valid):
 660             comb += addrbits.eq(m_in.addr[amin : amax])
 661         with m.Else():
 662             comb += addrbits.eq(d_in.addr[amin : amax])
 663         comb += index.eq(addrbits)
 664
 665         # If we have any op and the previous op isn't finished,
 666         # then keep the same output for next cycle.
 667         with m.If(~r0_stall):
 668             sync += tlb_valid_way.eq(dtlb_valid_bits[index])
 669             sync += tlb_tag_way.eq(dtlb_tags[index])
 670             sync += tlb_pte_way.eq(dtlb_ptes[index])
 671
 672     def maybe_tlb_plrus(self, m, r1, tlb_plru_victim):
 673         """Generate TLB PLRUs
 674         """
 675         comb = m.d.comb
 676         sync = m.d.sync
 677
 678         if TLB_NUM_WAYS == 0:
 679             return
 680         for i in range(TLB_SET_SIZE):
 681             # TLB PLRU interface
 682             tlb_plru        = PLRU(TLB_WAY_BITS)
 683             setattr(m.submodules, "maybe_plru_%d" % i, tlb_plru)
 684             tlb_plru_acc_en = Signal()
 685
 686             comb += tlb_plru_acc_en.eq(r1.tlb_hit & (r1.tlb_hit_index == i))
 687             comb += tlb_plru.acc_en.eq(tlb_plru_acc_en)
 688             comb += tlb_plru.acc_i.eq(r1.tlb_hit_way)
 689             comb += tlb_plru_victim[i].eq(tlb_plru.lru_o)
 690
 691     def tlb_search(self, m, tlb_req_index, r0, r0_valid,
 692                    tlb_valid_way, tlb_tag_way, tlb_hit_way,
 693                    tlb_pte_way, pte, tlb_hit, valid_ra, perm_attr, ra):
 694
 695         comb = m.d.comb
 696
 697         hitway = Signal(TLB_WAY_BITS)
 698         hit    = Signal()
 699         eatag  = Signal(TLB_EA_TAG_BITS)
 700
 701         TLB_LG_END = TLB_LG_PGSZ + TLB_SET_BITS
 702         comb += tlb_req_index.eq(r0.req.addr[TLB_LG_PGSZ : TLB_LG_END])
 703         comb += eatag.eq(r0.req.addr[TLB_LG_END : 64 ])
 704
 705         for i in range(TLB_NUM_WAYS):
 706             is_tag_hit = Signal()
 707             comb += is_tag_hit.eq(tlb_valid_way[i]
 708                                   & (read_tlb_tag(i, tlb_tag_way) == eatag))
 709             with m.If(is_tag_hit):
 710                 comb += hitway.eq(i)
 711                 comb += hit.eq(1)
 712
 713         comb += tlb_hit.eq(hit & r0_valid)
 714         comb += tlb_hit_way.eq(hitway)
 715
 716         with m.If(tlb_hit):
 717             comb += pte.eq(read_tlb_pte(hitway, tlb_pte_way))
 718         comb += valid_ra.eq(tlb_hit | ~r0.req.virt_mode)
 719
 720         with m.If(r0.req.virt_mode):
 721             comb += ra.eq(Cat(Const(0, ROW_OFF_BITS),
 722                               r0.req.addr[ROW_OFF_BITS:TLB_LG_PGSZ],
 723                               pte[TLB_LG_PGSZ:REAL_ADDR_BITS]))
 724             comb += perm_attr.reference.eq(pte[8])
 725             comb += perm_attr.changed.eq(pte[7])
 726             comb += perm_attr.nocache.eq(pte[5])
 727             comb += perm_attr.priv.eq(pte[3])
 728             comb += perm_attr.rd_perm.eq(pte[2])
 729             comb += perm_attr.wr_perm.eq(pte[1])
 730         with m.Else():
 731             comb += ra.eq(Cat(Const(0, ROW_OFF_BITS),
 732                               r0.req.addr[ROW_OFF_BITS:REAL_ADDR_BITS]))
 733             comb += perm_attr.reference.eq(1)
 734             comb += perm_attr.changed.eq(1)
 735             comb += perm_attr.nocache.eq(0)
 736             comb += perm_attr.priv.eq(1)
 737             comb += perm_attr.rd_perm.eq(1)
 738             comb += perm_attr.wr_perm.eq(1)
 739
 740     def tlb_update(self, m, r0_valid, r0, dtlb_valid_bits, tlb_req_index,
 741                     tlb_hit_way, tlb_hit, tlb_plru_victim, tlb_tag_way,
 742                     dtlb_tags, tlb_pte_way, dtlb_ptes):
 743
 744         dtlb_valids = TLBValidBitsArray()
 745
 746         comb = m.d.comb
 747         sync = m.d.sync
 748
 749         tlbie    = Signal()
 750         tlbwe    = Signal()
 751
 752         comb += tlbie.eq(r0_valid & r0.tlbie)
 753         comb += tlbwe.eq(r0_valid & r0.tlbld)
 754
 755         m.submodules.tlb_update = d = DTLBUpdate()
 756         with m.If(tlbie & r0.doall):
 757             # clear all valid bits at once
 758             for i in range(TLB_SET_SIZE):
 759                 sync += dtlb_valid_bits[i].eq(0)
 760         with m.If(d.updated):
 761             sync += dtlb_tags[tlb_req_index].eq(d.tb_out)
 762             sync += dtlb_ptes[tlb_req_index].eq(d.pb_out)
 763         with m.If(d.v_updated):
 764             sync += dtlb_valid_bits[tlb_req_index].eq(d.db_out)
 765
 766         comb += d.dv.eq(dtlb_valid_bits[tlb_req_index])
 767
 768         comb += d.tlbie.eq(tlbie)
 769         comb += d.tlbwe.eq(tlbwe)
 770         comb += d.doall.eq(r0.doall)
 771         comb += d.tlb_hit.eq(tlb_hit)
 772         comb += d.tlb_hit_way.eq(tlb_hit_way)
 773         comb += d.tlb_tag_way.eq(tlb_tag_way)
 774         comb += d.tlb_pte_way.eq(tlb_pte_way)
 775         comb += d.tlb_req_index.eq(tlb_req_index)
 776
 777         with m.If(tlb_hit):
 778             comb += d.repl_way.eq(tlb_hit_way)
 779         with m.Else():
 780             comb += d.repl_way.eq(tlb_plru_victim[tlb_req_index])
 781         comb += d.eatag.eq(r0.req.addr[TLB_LG_PGSZ + TLB_SET_BITS:64])
 782         comb += d.pte_data.eq(r0.req.data)
 783
 784     def maybe_plrus(self, m, r1, plru_victim):
 785         """Generate PLRUs
 786         """
 787         comb = m.d.comb
 788         sync = m.d.sync
 789
 790         if TLB_NUM_WAYS == 0:
 791             return
 792
 793         for i in range(NUM_LINES):
 794             # PLRU interface
 795             plru        = PLRU(WAY_BITS)
 796             setattr(m.submodules, "plru%d" % i, plru)
 797             plru_acc_en = Signal()
 798
 799             comb += plru_acc_en.eq(r1.cache_hit & (r1.hit_index == i))
 800             comb += plru.acc_en.eq(plru_acc_en)
 801             comb += plru.acc_i.eq(r1.hit_way)
 802             comb += plru_victim[i].eq(plru.lru_o)
 803
 804     def cache_tag_read(self, m, r0_stall, req_index, cache_tag_set, cache_tags):
 805         """Cache tag RAM read port
 806         """
 807         comb = m.d.comb
 808         sync = m.d.sync
 809         m_in, d_in = self.m_in, self.d_in
 810
 811         index = Signal(INDEX_BITS)
 812
 813         with m.If(r0_stall):
 814             comb += index.eq(req_index)
 815         with m.Elif(m_in.valid):
 816             comb += index.eq(get_index(m_in.addr))
 817         with m.Else():
 818             comb += index.eq(get_index(d_in.addr))
 819         sync += cache_tag_set.eq(cache_tags[index])
 820
 821     def dcache_request(self, m, r0, ra, req_index, req_row, req_tag,
 822                        r0_valid, r1, cache_valids, replace_way,
 823                        use_forward1_next, use_forward2_next,
 824                        req_hit_way, plru_victim, rc_ok, perm_attr,
 825                        valid_ra, perm_ok, access_ok, req_op, req_go,
 826                        tlb_pte_way,
 827                        tlb_hit, tlb_hit_way, tlb_valid_way, cache_tag_set,
 828                        cancel_store, req_same_tag, r0_stall, early_req_row):
 829         """Cache request parsing and hit detection
 830         """
 831
 832         comb = m.d.comb
 833         m_in, d_in = self.m_in, self.d_in
 834
 835         is_hit      = Signal()
 836         hit_way     = Signal(WAY_BITS)
 837         op          = Signal(Op)
 838         opsel       = Signal(3)
 839         go          = Signal()
 840         nc          = Signal()
 841         hit_set     = Array(Signal(name="hit_set_%d" % i) \
 842                                   for i in range(TLB_NUM_WAYS))
 843         cache_valid_idx = Signal(NUM_WAYS)
 844
 845         # Extract line, row and tag from request
 846         comb += req_index.eq(get_index(r0.req.addr))
 847         comb += req_row.eq(get_row(r0.req.addr))
 848         comb += req_tag.eq(get_tag(ra))
 849
 850         if False: # display on comb is a bit... busy.
 851             comb += Display("dcache_req addr:%x ra: %x idx: %x tag: %x row: %x",
 852                     r0.req.addr, ra, req_index, req_tag, req_row)
 853
 854         comb += go.eq(r0_valid & ~(r0.tlbie | r0.tlbld) & ~r1.ls_error)
 855         comb += cache_valid_idx.eq(cache_valids[req_index])
 856
 857         m.submodules.dcache_pend = dc = DCachePendingHit(tlb_pte_way,
 858                                 tlb_valid_way, tlb_hit_way,
 859                                 cache_valid_idx, cache_tag_set,
 860                                 r0.req.addr,
 861                                 hit_set)
 862
 863         comb += dc.tlb_hit.eq(tlb_hit)
 864         comb += dc.reload_tag.eq(r1.reload_tag)
 865         comb += dc.virt_mode.eq(r0.req.virt_mode)
 866         comb += dc.go.eq(go)
 867         comb += dc.req_index.eq(req_index)
 868         comb += is_hit.eq(dc.is_hit)
 869         comb += hit_way.eq(dc.hit_way)
 870         comb += req_same_tag.eq(dc.rel_match)
 871
 872         # See if the request matches the line currently being reloaded
 873         with m.If((r1.state == State.RELOAD_WAIT_ACK) &
 874                   (req_index == r1.store_index) & req_same_tag):
 875             # For a store, consider this a hit even if the row isn't
 876             # valid since it will be by the time we perform the store.
 877             # For a load, check the appropriate row valid bit.
 878             rrow = Signal(ROW_LINE_BITS)
 879             comb += rrow.eq(req_row)
 880             valid = r1.rows_valid[rrow]
 881             comb += is_hit.eq((~r0.req.load) | valid)
 882             comb += hit_way.eq(replace_way)
 883
 884         # Whether to use forwarded data for a load or not
 885         with m.If((get_row(r1.req.real_addr) == req_row) &
 886                   (r1.req.hit_way == hit_way)):
 887             # Only need to consider r1.write_bram here, since if we
 888             # are writing refill data here, then we don't have a
 889             # cache hit this cycle on the line being refilled.
 890             # (There is the possibility that the load following the
 891             # load miss that started the refill could be to the old
 892             # contents of the victim line, since it is a couple of
 893             # cycles after the refill starts before we see the updated
 894             # cache tag. In that case we don't use the bypass.)
 895             comb += use_forward1_next.eq(r1.write_bram)
 896         with m.If((r1.forward_row1 == req_row) & (r1.forward_way1 == hit_way)):
 897             comb += use_forward2_next.eq(r1.forward_valid1)
 898
 899         # The way that matched on a hit
 900         comb += req_hit_way.eq(hit_way)
 901
 902         # The way to replace on a miss
 903         with m.If(r1.write_tag):
 904             comb += replace_way.eq(plru_victim[r1.store_index])
 905         with m.Else():
 906             comb += replace_way.eq(r1.store_way)
 907
 908         # work out whether we have permission for this access
 909         # NB we don't yet implement AMR, thus no KUAP
 910         comb += rc_ok.eq(perm_attr.reference
 911                          & (r0.req.load | perm_attr.changed))
 912         comb += perm_ok.eq((r0.req.priv_mode | (~perm_attr.priv)) &
 913                            (perm_attr.wr_perm |
 914                               (r0.req.load & perm_attr.rd_perm)))
 915         comb += access_ok.eq(valid_ra & perm_ok & rc_ok)
 916         # Combine the request and cache hit status to decide what
 917         # operation needs to be done
 918         comb += nc.eq(r0.req.nc | perm_attr.nocache)
 919         comb += op.eq(Op.OP_NONE)
 920         with m.If(go):
 921             with m.If(~access_ok):
 922                 comb += op.eq(Op.OP_BAD)
 923             with m.Elif(cancel_store):
 924                 comb += op.eq(Op.OP_STCX_FAIL)
 925             with m.Else():
 926                 comb += opsel.eq(Cat(is_hit, nc, r0.req.load))
 927                 with m.Switch(opsel):
 928                     with m.Case(0b101): comb += op.eq(Op.OP_LOAD_HIT)
 929                     with m.Case(0b100): comb += op.eq(Op.OP_LOAD_MISS)
 930                     with m.Case(0b110): comb += op.eq(Op.OP_LOAD_NC)
 931                     with m.Case(0b001): comb += op.eq(Op.OP_STORE_HIT)
 932                     with m.Case(0b000): comb += op.eq(Op.OP_STORE_MISS)
 933                     with m.Case(0b010): comb += op.eq(Op.OP_STORE_MISS)
 934                     with m.Case(0b011): comb += op.eq(Op.OP_BAD)
 935                     with m.Case(0b111): comb += op.eq(Op.OP_BAD)
 936         comb += req_op.eq(op)
 937         comb += req_go.eq(go)
 938
 939         # Version of the row number that is valid one cycle earlier
 940         # in the cases where we need to read the cache data BRAM.
 941         # If we're stalling then we need to keep reading the last
 942         # row requested.
 943         with m.If(~r0_stall):
 944             with m.If(m_in.valid):
 945                 comb += early_req_row.eq(get_row(m_in.addr))
 946             with m.Else():
 947                 comb += early_req_row.eq(get_row(d_in.addr))
 948         with m.Else():
 949             comb += early_req_row.eq(req_row)
 950
 951     def reservation_comb(self, m, cancel_store, set_rsrv, clear_rsrv,
 952                          r0_valid, r0, reservation):
 953         """Handle load-with-reservation and store-conditional instructions
 954         """
 955         comb = m.d.comb
 956
 957         with m.If(r0_valid & r0.req.reserve):
 958             # XXX generate alignment interrupt if address
 959             # is not aligned XXX or if r0.req.nc = '1'
 960             with m.If(r0.req.load):
 961                 comb += set_rsrv.eq(r0.req.atomic_last) # load with reservation
 962             with m.Else():
 963                 comb += clear_rsrv.eq(r0.req.atomic_last) # store conditional
 964                 with m.If((~reservation.valid) |
 965                          (r0.req.addr[LINE_OFF_BITS:64] != reservation.addr)):
 966                     comb += cancel_store.eq(1)
 967
 968     def reservation_reg(self, m, r0_valid, access_ok, set_rsrv, clear_rsrv,
 969                         reservation, r0):
 970
 971         comb = m.d.comb
 972         sync = m.d.sync
 973
 974         with m.If(r0_valid & access_ok):
 975             with m.If(clear_rsrv):
 976                 sync += reservation.valid.eq(0)
 977             with m.Elif(set_rsrv):
 978                 sync += reservation.valid.eq(1)
 979                 sync += reservation.addr.eq(r0.req.addr[LINE_OFF_BITS:64])
 980
 981     def writeback_control(self, m, r1, cache_out_row):
 982         """Return data for loads & completion control logic
 983         """
 984         comb = m.d.comb
 985         sync = m.d.sync
 986         d_out, m_out = self.d_out, self.m_out
 987
 988         data_out = Signal(64)
 989         data_fwd = Signal(64)
 990
 991         # Use the bypass if are reading the row that was
 992         # written 1 or 2 cycles ago, including for the
 993         # slow_valid = 1 case (i.e. completing a load
 994         # miss or a non-cacheable load).
 995         with m.If(r1.use_forward1):
 996             comb += data_fwd.eq(r1.forward_data1)
 997         with m.Else():
 998             comb += data_fwd.eq(r1.forward_data2)
 999
1000         comb += data_out.eq(cache_out_row)
1001
1002         for i in range(8):
1003             with m.If(r1.forward_sel[i]):
1004                 dsel = data_fwd.word_select(i, 8)
1005                 comb += data_out.word_select(i, 8).eq(dsel)
1006
1007         comb += d_out.valid.eq(r1.ls_valid)
1008         comb += d_out.data.eq(data_out)
1009         comb += d_out.store_done.eq(~r1.stcx_fail)
1010         comb += d_out.error.eq(r1.ls_error)
1011         comb += d_out.cache_paradox.eq(r1.cache_paradox)
1012
1013         # Outputs to MMU
1014         comb += m_out.done.eq(r1.mmu_done)
1015         comb += m_out.err.eq(r1.mmu_error)
1016         comb += m_out.data.eq(data_out)
1017
1018         # We have a valid load or store hit or we just completed
1019         # a slow op such as a load miss, a NC load or a store
1020         #
1021         # Note: the load hit is delayed by one cycle. However it
1022         # can still not collide with r.slow_valid (well unless I
1023         # miscalculated) because slow_valid can only be set on a
1024         # subsequent request and not on its first cycle (the state
1025         # machine must have advanced), which makes slow_valid
1026         # at least 2 cycles from the previous hit_load_valid.
1027
1028         # Sanity: Only one of these must be set in any given cycle
1029
1030         if False: # TODO: need Display to get this to work
1031             assert (r1.slow_valid & r1.stcx_fail) != 1, \
1032             "unexpected slow_valid collision with stcx_fail"
1033
1034             assert ((r1.slow_valid | r1.stcx_fail) | r1.hit_load_valid) != 1, \
1035              "unexpected hit_load_delayed collision with slow_valid"
1036
1037         with m.If(~r1.mmu_req):
1038             # Request came from loadstore1...
1039             # Load hit case is the standard path
1040             with m.If(r1.hit_load_valid):
1041                 sync += Display("completing load hit data=%x", data_out)
1042
1043             # error cases complete without stalling
1044             with m.If(r1.ls_error):
1045                 sync += Display("completing ld/st with error")
1046
1047             # Slow ops (load miss, NC, stores)
1048             with m.If(r1.slow_valid):
1049                 sync += Display("completing store or load miss adr=%x data=%x",
1050                                 r1.req.real_addr, data_out)
1051
1052         with m.Else():
1053             # Request came from MMU
1054             with m.If(r1.hit_load_valid):
1055                 sync += Display("completing load hit to MMU, data=%x",
1056                                 m_out.data)
1057             # error cases complete without stalling
1058             with m.If(r1.mmu_error):
1059                 sync += Display("combpleting MMU ld with error")
1060
1061             # Slow ops (i.e. load miss)
1062             with m.If(r1.slow_valid):
1063                 sync += Display("completing MMU load miss, data=%x",
1064                                 m_out.data)
1065
1066     def rams(self, m, r1, early_req_row, cache_out_row, replace_way):
1067         """rams
1068         Generate a cache RAM for each way. This handles the normal
1069         reads, writes from reloads and the special store-hit update
1070         path as well.
1071
1072         Note: the BRAMs have an extra read buffer, meaning the output
1073         is pipelined an extra cycle. This differs from the
1074         icache. The writeback logic needs to take that into
1075         account by using 1-cycle delayed signals for load hits.
1076         """
1077         comb = m.d.comb
1078         wb_in = self.wb_in
1079
1080         for i in range(NUM_WAYS):
1081             do_read  = Signal(name="do_rd%d" % i)
1082             rd_addr  = Signal(ROW_BITS, name="rd_addr_%d" % i)
1083             do_write = Signal(name="do_wr%d" % i)
1084             wr_addr  = Signal(ROW_BITS, name="wr_addr_%d" % i)
1085             wr_data  = Signal(WB_DATA_BITS, name="din_%d" % i)
1086             wr_sel   = Signal(ROW_SIZE)
1087             wr_sel_m = Signal(ROW_SIZE)
1088             _d_out   = Signal(WB_DATA_BITS, name="dout_%d" % i) # cache_row_t
1089
1090             way = CacheRam(ROW_BITS, WB_DATA_BITS, ADD_BUF=True)
1091             setattr(m.submodules, "cacheram_%d" % i, way)
1092
1093             comb += way.rd_en.eq(do_read)
1094             comb += way.rd_addr.eq(rd_addr)
1095             comb += _d_out.eq(way.rd_data_o)
1096             comb += way.wr_sel.eq(wr_sel_m)
1097             comb += way.wr_addr.eq(wr_addr)
1098             comb += way.wr_data.eq(wr_data)
1099
1100             # Cache hit reads
1101             comb += do_read.eq(1)
1102             comb += rd_addr.eq(early_req_row)
1103             with m.If(r1.hit_way == i):
1104                 comb += cache_out_row.eq(_d_out)
1105
1106             # Write mux:
1107             #
1108             # Defaults to wishbone read responses (cache refill)
1109             #
1110             # For timing, the mux on wr_data/sel/addr is not
1111             # dependent on anything other than the current state.
1112
1113             with m.If(r1.write_bram):
1114                 # Write store data to BRAM.  This happens one
1115                 # cycle after the store is in r0.
1116                 comb += wr_data.eq(r1.req.data)
1117                 comb += wr_sel.eq(r1.req.byte_sel)
1118                 comb += wr_addr.eq(get_row(r1.req.real_addr))
1119
1120                 with m.If(i == r1.req.hit_way):
1121                     comb += do_write.eq(1)
1122             with m.Else():
1123                 # Otherwise, we might be doing a reload or a DCBZ
1124                 with m.If(r1.dcbz):
1125                     comb += wr_data.eq(0)
1126                 with m.Else():
1127                     comb += wr_data.eq(wb_in.dat)
1128                 comb += wr_addr.eq(r1.store_row)
1129                 comb += wr_sel.eq(~0) # all 1s
1130
1131             with m.If((r1.state == State.RELOAD_WAIT_ACK)
1132                       & wb_in.ack & (replace_way == i)):
1133                 comb += do_write.eq(1)
1134
1135             # Mask write selects with do_write since BRAM
1136             # doesn't have a global write-enable
1137             with m.If(do_write):
1138                 comb += wr_sel_m.eq(wr_sel)
1139
1140     # Cache hit synchronous machine for the easy case.
1141     # This handles load hits.
1142     # It also handles error cases (TLB miss, cache paradox)
1143     def dcache_fast_hit(self, m, req_op, r0_valid, r0, r1,
1144                         req_hit_way, req_index, req_tag, access_ok,
1145                         tlb_hit, tlb_hit_way, tlb_req_index):
1146
1147         comb = m.d.comb
1148         sync = m.d.sync
1149
1150         with m.If(req_op != Op.OP_NONE):
1151             sync += Display("op:%d addr:%x nc: %d idx: %x tag: %x way: %x",
1152                     req_op, r0.req.addr, r0.req.nc,
1153                     req_index, req_tag, req_hit_way)
1154
1155         with m.If(r0_valid):
1156             sync += r1.mmu_req.eq(r0.mmu_req)
1157
1158         # Fast path for load/store hits.
1159         # Set signals for the writeback controls.
1160         sync += r1.hit_way.eq(req_hit_way)
1161         sync += r1.hit_index.eq(req_index)
1162
1163         with m.If(req_op == Op.OP_LOAD_HIT):
1164             sync += r1.hit_load_valid.eq(1)
1165         with m.Else():
1166             sync += r1.hit_load_valid.eq(0)
1167
1168         with m.If((req_op == Op.OP_LOAD_HIT) | (req_op == Op.OP_STORE_HIT)):
1169             sync += r1.cache_hit.eq(1)
1170         with m.Else():
1171             sync += r1.cache_hit.eq(0)
1172
1173         with m.If(req_op == Op.OP_BAD):
1174             # Display(f"Signalling ld/st error valid_ra={valid_ra}"
1175             #      f"rc_ok={rc_ok} perm_ok={perm_ok}"
1176             sync += r1.ls_error.eq(~r0.mmu_req)
1177             sync += r1.mmu_error.eq(r0.mmu_req)
1178             sync += r1.cache_paradox.eq(access_ok)
1179
1180             with m.Else():
1181                 sync += r1.ls_error.eq(0)
1182                 sync += r1.mmu_error.eq(0)
1183                 sync += r1.cache_paradox.eq(0)
1184
1185         with m.If(req_op == Op.OP_STCX_FAIL):
1186             sync += r1.stcx_fail.eq(1)
1187         with m.Else():
1188             sync += r1.stcx_fail.eq(0)
1189
1190         # Record TLB hit information for updating TLB PLRU
1191         sync += r1.tlb_hit.eq(tlb_hit)
1192         sync += r1.tlb_hit_way.eq(tlb_hit_way)
1193         sync += r1.tlb_hit_index.eq(tlb_req_index)
1194
1195     # Memory accesses are handled by this state machine:
1196     #
1197     #   * Cache load miss/reload (in conjunction with "rams")
1198     #   * Load hits for non-cachable forms
1199     #   * Stores (the collision case is handled in "rams")
1200     #
1201     # All wishbone requests generation is done here.
1202     # This machine operates at stage 1.
1203     def dcache_slow(self, m, r1, use_forward1_next, use_forward2_next,
1204                     cache_valids, r0, replace_way,
1205                     req_hit_way, req_same_tag,
1206                     r0_valid, req_op, cache_tags, req_go, ra):
1207
1208         comb = m.d.comb
1209         sync = m.d.sync
1210         wb_in = self.wb_in
1211         d_in = self.d_in
1212
1213         req         = MemAccessRequest("mreq_ds")
1214
1215         req_row = Signal(ROW_BITS)
1216         req_idx = Signal(INDEX_BITS)
1217         req_tag = Signal(TAG_BITS)
1218         comb += req_idx.eq(get_index(req.real_addr))
1219         comb += req_row.eq(get_row(req.real_addr))
1220         comb += req_tag.eq(get_tag(req.real_addr))
1221
1222         sync += r1.use_forward1.eq(use_forward1_next)
1223         sync += r1.forward_sel.eq(0)
1224
1225         with m.If(use_forward1_next):
1226             sync += r1.forward_sel.eq(r1.req.byte_sel)
1227         with m.Elif(use_forward2_next):
1228             sync += r1.forward_sel.eq(r1.forward_sel1)
1229
1230         sync += r1.forward_data2.eq(r1.forward_data1)
1231         with m.If(r1.write_bram):
1232             sync += r1.forward_data1.eq(r1.req.data)
1233             sync += r1.forward_sel1.eq(r1.req.byte_sel)
1234             sync += r1.forward_way1.eq(r1.req.hit_way)
1235             sync += r1.forward_row1.eq(get_row(r1.req.real_addr))
1236             sync += r1.forward_valid1.eq(1)
1237         with m.Else():
1238             with m.If(r1.dcbz):
1239                 sync += r1.forward_data1.eq(0)
1240             with m.Else():
1241                 sync += r1.forward_data1.eq(wb_in.dat)
1242             sync += r1.forward_sel1.eq(~0) # all 1s
1243             sync += r1.forward_way1.eq(replace_way)
1244             sync += r1.forward_row1.eq(r1.store_row)
1245             sync += r1.forward_valid1.eq(0)
1246
1247         # One cycle pulses reset
1248         sync += r1.slow_valid.eq(0)
1249         sync += r1.write_bram.eq(0)
1250         sync += r1.inc_acks.eq(0)
1251         sync += r1.dec_acks.eq(0)
1252
1253         sync += r1.ls_valid.eq(0)
1254         # complete tlbies and TLB loads in the third cycle
1255         sync += r1.mmu_done.eq(r0_valid & (r0.tlbie | r0.tlbld))
1256
1257         with m.If((req_op == Op.OP_LOAD_HIT) | (req_op == Op.OP_STCX_FAIL)):
1258             with m.If(~r0.mmu_req):
1259                 sync += r1.ls_valid.eq(1)
1260             with m.Else():
1261                 sync += r1.mmu_done.eq(1)
1262
1263         with m.If(r1.write_tag):
1264             # Store new tag in selected way
1265             for i in range(NUM_WAYS):
1266                 with m.If(i == replace_way):
1267                     ct = Signal(TAG_RAM_WIDTH)
1268                     comb += ct.eq(cache_tags[r1.store_index])
1269                     """
1270 TODO: check this
1271 cache_tags(r1.store_index)((i + 1) * TAG_WIDTH - 1 downto i * TAG_WIDTH) <=
1272                     (TAG_WIDTH - 1 downto TAG_BITS => '0') & r1.reload_tag;
1273                     """
1274                     comb += ct.word_select(i, TAG_WIDTH).eq(r1.reload_tag)
1275                     sync += cache_tags[r1.store_index].eq(ct)
1276             sync += r1.store_way.eq(replace_way)
1277             sync += r1.write_tag.eq(0)
1278
1279         # Take request from r1.req if there is one there,
1280         # else from req_op, ra, etc.
1281         with m.If(r1.full):
1282             comb += req.eq(r1.req)
1283         with m.Else():
1284             comb += req.op.eq(req_op)
1285             comb += req.valid.eq(req_go)
1286             comb += req.mmu_req.eq(r0.mmu_req)
1287             comb += req.dcbz.eq(r0.req.dcbz)
1288             comb += req.real_addr.eq(ra)
1289
1290             with m.If(r0.req.dcbz):
1291                 # force data to 0 for dcbz
1292                 comb += req.data.eq(0)
1293             with m.Elif(r0.d_valid):
1294                 comb += req.data.eq(r0.req.data)
1295             with m.Else():
1296                 comb += req.data.eq(d_in.data)
1297
1298             # Select all bytes for dcbz
1299             # and for cacheable loads
1300             with m.If(r0.req.dcbz | (r0.req.load & ~r0.req.nc)):
1301                 comb += req.byte_sel.eq(~0) # all 1s
1302             with m.Else():
1303                 comb += req.byte_sel.eq(r0.req.byte_sel)
1304             comb += req.hit_way.eq(req_hit_way)
1305             comb += req.same_tag.eq(req_same_tag)
1306
1307             # Store the incoming request from r0,
1308             # if it is a slow request
1309             # Note that r1.full = 1 implies req_op = OP_NONE
1310             with m.If((req_op == Op.OP_LOAD_MISS)
1311                       | (req_op == Op.OP_LOAD_NC)
1312                       | (req_op == Op.OP_STORE_MISS)
1313                       | (req_op == Op.OP_STORE_HIT)):
1314                 sync += r1.req.eq(req)
1315                 sync += r1.full.eq(1)
1316
1317         # Main state machine
1318         with m.Switch(r1.state):
1319
1320             with m.Case(State.IDLE):
1321                 sync += r1.wb.adr.eq(req.real_addr[ROW_LINE_BITS:])
1322                 sync += r1.wb.sel.eq(req.byte_sel)
1323                 sync += r1.wb.dat.eq(req.data)
1324                 sync += r1.dcbz.eq(req.dcbz)
1325
1326                 # Keep track of our index and way
1327                 # for subsequent stores.
1328                 sync += r1.store_index.eq(req_idx)
1329                 sync += r1.store_row.eq(req_row)
1330                 sync += r1.end_row_ix.eq(get_row_of_line(req_row)-1)
1331                 sync += r1.reload_tag.eq(req_tag)
1332                 sync += r1.req.same_tag.eq(1)
1333
1334                 with m.If(req.op == Op.OP_STORE_HIT):
1335                     sync += r1.store_way.eq(req.hit_way)
1336
1337                 # Reset per-row valid bits,
1338                 # ready for handling OP_LOAD_MISS
1339                 for i in range(ROW_PER_LINE):
1340                     sync += r1.rows_valid[i].eq(0)
1341
1342                 with m.If(req_op != Op.OP_NONE):
1343                     sync += Display("cache op %d", req.op)
1344
1345                 with m.Switch(req.op):
1346                     with m.Case(Op.OP_LOAD_HIT):
1347                         # stay in IDLE state
1348                         pass
1349
1350                     with m.Case(Op.OP_LOAD_MISS):
1351                         sync += Display("cache miss real addr: %x " \
1352                                 "idx: %x tag: %x",
1353                                 req.real_addr, req_row, req_tag)
1354
1355                         # Start the wishbone cycle
1356                         sync += r1.wb.we.eq(0)
1357                         sync += r1.wb.cyc.eq(1)
1358                         sync += r1.wb.stb.eq(1)
1359
1360                         # Track that we had one request sent
1361                         sync += r1.state.eq(State.RELOAD_WAIT_ACK)
1362                         sync += r1.write_tag.eq(1)
1363
1364                     with m.Case(Op.OP_LOAD_NC):
1365                         sync += r1.wb.cyc.eq(1)
1366                         sync += r1.wb.stb.eq(1)
1367                         sync += r1.wb.we.eq(0)
1368                         sync += r1.state.eq(State.NC_LOAD_WAIT_ACK)
1369
1370                     with m.Case(Op.OP_STORE_HIT, Op.OP_STORE_MISS):
1371                         with m.If(~req.dcbz):
1372                             sync += r1.state.eq(State.STORE_WAIT_ACK)
1373                             sync += r1.acks_pending.eq(1)
1374                             sync += r1.full.eq(0)
1375                             sync += r1.slow_valid.eq(1)
1376
1377                             with m.If(~req.mmu_req):
1378                                 sync += r1.ls_valid.eq(1)
1379                             with m.Else():
1380                                 sync += r1.mmu_done.eq(1)
1381
1382                             with m.If(req.op == Op.OP_STORE_HIT):
1383                                 sync += r1.write_bram.eq(1)
1384                         with m.Else():
1385                             # dcbz is handled much like a load miss except
1386                             # that we are writing to memory instead of reading
1387                             sync += r1.state.eq(State.RELOAD_WAIT_ACK)
1388
1389                             with m.If(req.op == Op.OP_STORE_MISS):
1390                                 sync += r1.write_tag.eq(1)
1391
1392                         sync += r1.wb.we.eq(1)
1393                         sync += r1.wb.cyc.eq(1)
1394                         sync += r1.wb.stb.eq(1)
1395
1396                     # OP_NONE and OP_BAD do nothing
1397                     # OP_BAD & OP_STCX_FAIL were
1398                     # handled above already
1399                     with m.Case(Op.OP_NONE):
1400                         pass
1401                     with m.Case(Op.OP_BAD):
1402                         pass
1403                     with m.Case(Op.OP_STCX_FAIL):
1404                         pass
1405
1406             with m.Case(State.RELOAD_WAIT_ACK):
1407                 ld_stbs_done = Signal()
1408                 # Requests are all sent if stb is 0
1409                 comb += ld_stbs_done.eq(~r1.wb.stb)
1410
1411                 # If we are still sending requests, was one accepted?
1412                 with m.If((~wb_in.stall) & r1.wb.stb):
1413                     # That was the last word?  We are done sending.
1414                     # Clear stb and set ld_stbs_done so we can handle an
1415                     # eventual last ack on the same cycle.
1416                     # sigh - reconstruct wb adr with 3 extra 0s at front
1417                     wb_adr = Cat(Const(0, ROW_OFF_BITS), r1.wb.adr)
1418                     with m.If(is_last_row_addr(wb_adr, r1.end_row_ix)):
1419                         sync += r1.wb.stb.eq(0)
1420                         comb += ld_stbs_done.eq(1)
1421
1422                     # Calculate the next row address in the current cache line
1423                     row = Signal(LINE_OFF_BITS-ROW_OFF_BITS)
1424                     comb += row.eq(r1.wb.adr)
1425                     sync += r1.wb.adr[:LINE_OFF_BITS-ROW_OFF_BITS].eq(row+1)
1426
1427                 # Incoming acks processing
1428                 sync += r1.forward_valid1.eq(wb_in.ack)
1429                 with m.If(wb_in.ack):
1430                     srow = Signal(ROW_LINE_BITS)
1431                     comb += srow.eq(r1.store_row)
1432                     sync += r1.rows_valid[srow].eq(1)
1433
1434                     # If this is the data we were looking for,
1435                     # we can complete the request next cycle.
1436                     # Compare the whole address in case the
1437                     # request in r1.req is not the one that
1438                     # started this refill.
1439                     with m.If(req.valid & r1.req.same_tag &
1440                               ((r1.dcbz & r1.req.dcbz) |
1441                                (~r1.dcbz & (r1.req.op == Op.OP_LOAD_MISS))) &
1442                                 (r1.store_row == get_row(req.real_addr))):
1443                         sync += r1.full.eq(0)
1444                         sync += r1.slow_valid.eq(1)
1445                         with m.If(~r1.mmu_req):
1446                             sync += r1.ls_valid.eq(1)
1447                         with m.Else():
1448                             sync += r1.mmu_done.eq(1)
1449                         sync += r1.forward_sel.eq(~0) # all 1s
1450                         sync += r1.use_forward1.eq(1)
1451
1452                     # Check for completion
1453                     with m.If(ld_stbs_done & is_last_row(r1.store_row,
1454                                                       r1.end_row_ix)):
1455                         # Complete wishbone cycle
1456                         sync += r1.wb.cyc.eq(0)
1457
1458                         # Cache line is now valid
1459                         cv = Signal(INDEX_BITS)
1460                         comb += cv.eq(cache_valids[r1.store_index])
1461                         comb += cv.bit_select(r1.store_way, 1).eq(1)
1462                         sync += cache_valids[r1.store_index].eq(cv)
1463
1464                         sync += r1.state.eq(State.IDLE)
1465
1466                     # Increment store row counter
1467                     sync += r1.store_row.eq(next_row(r1.store_row))
1468
1469             with m.Case(State.STORE_WAIT_ACK):
1470                 st_stbs_done = Signal()
1471                 acks        = Signal(3)
1472                 adjust_acks = Signal(3)
1473
1474                 comb += st_stbs_done.eq(~r1.wb.stb)
1475                 comb += acks.eq(r1.acks_pending)
1476
1477                 with m.If(r1.inc_acks != r1.dec_acks):
1478                     with m.If(r1.inc_acks):
1479                         comb += adjust_acks.eq(acks + 1)
1480                     with m.Else():
1481                         comb += adjust_acks.eq(acks - 1)
1482                 with m.Else():
1483                     comb += adjust_acks.eq(acks)
1484
1485                 sync += r1.acks_pending.eq(adjust_acks)
1486
1487                 # Clear stb when slave accepted request
1488                 with m.If(~wb_in.stall):
1489                     # See if there is another store waiting
1490                     # to be done which is in the same real page.
1491                     with m.If(req.valid):
1492                         _ra = req.real_addr[ROW_LINE_BITS:SET_SIZE_BITS]
1493                         sync += r1.wb.adr[0:SET_SIZE_BITS].eq(_ra)
1494                         sync += r1.wb.dat.eq(req.data)
1495                         sync += r1.wb.sel.eq(req.byte_sel)
1496
1497                     with m.If((adjust_acks < 7) & req.same_tag &
1498                                 ((req.op == Op.OP_STORE_MISS)
1499                                  | (req.op == Op.OP_STORE_HIT))):
1500                         sync += r1.wb.stb.eq(1)
1501                         comb += st_stbs_done.eq(0)
1502
1503                         with m.If(req.op == Op.OP_STORE_HIT):
1504                             sync += r1.write_bram.eq(1)
1505                         sync += r1.full.eq(0)
1506                         sync += r1.slow_valid.eq(1)
1507
1508                         # Store requests never come from the MMU
1509                         sync += r1.ls_valid.eq(1)
1510                         comb += st_stbs_done.eq(0)
1511                         sync += r1.inc_acks.eq(1)
1512                     with m.Else():
1513                         sync += r1.wb.stb.eq(0)
1514                         comb += st_stbs_done.eq(1)
1515
1516                 # Got ack ? See if complete.
1517                 with m.If(wb_in.ack):
1518                     with m.If(st_stbs_done & (adjust_acks == 1)):
1519                         sync += r1.state.eq(State.IDLE)
1520                         sync += r1.wb.cyc.eq(0)
1521                         sync += r1.wb.stb.eq(0)
1522                     sync += r1.dec_acks.eq(1)
1523
1524             with m.Case(State.NC_LOAD_WAIT_ACK):
1525                 # Clear stb when slave accepted request
1526                 with m.If(~wb_in.stall):
1527                     sync += r1.wb.stb.eq(0)
1528
1529                 # Got ack ? complete.
1530                 with m.If(wb_in.ack):
1531                     sync += r1.state.eq(State.IDLE)
1532                     sync += r1.full.eq(0)
1533                     sync += r1.slow_valid.eq(1)
1534
1535                     with m.If(~r1.mmu_req):
1536                         sync += r1.ls_valid.eq(1)
1537                     with m.Else():
1538                         sync += r1.mmu_done.eq(1)
1539
1540                     sync += r1.forward_sel.eq(~0) # all 1s
1541                     sync += r1.use_forward1.eq(1)
1542                     sync += r1.wb.cyc.eq(0)
1543                     sync += r1.wb.stb.eq(0)
1544
1545     def dcache_log(self, m, r1, valid_ra, tlb_hit_way, stall_out):
1546
1547         sync = m.d.sync
1548         d_out, wb_in, log_out = self.d_out, self.wb_in, self.log_out
1549
1550         sync += log_out.eq(Cat(r1.state[:3], valid_ra, tlb_hit_way[:3],
1551                                stall_out, req_op[:3], d_out.valid, d_out.error,
1552                                r1.wb.cyc, r1.wb.stb, wb_in.ack, wb_in.stall,
1553                                r1.real_adr[3:6]))
1554
1555     def elaborate(self, platform):
1556
1557         m = Module()
1558         comb = m.d.comb
1559         d_in = self.d_in
1560
1561         # Storage. Hopefully "cache_rows" is a BRAM, the rest is LUTs
1562         cache_tags       = CacheTagArray()
1563         cache_tag_set    = Signal(TAG_RAM_WIDTH)
1564         cache_valids = CacheValidBitsArray()
1565
1566         # TODO attribute ram_style : string;
1567         # TODO attribute ram_style of cache_tags : signal is "distributed";
1568
1569         """note: these are passed to nmigen.hdl.Memory as "attributes".
1570            don't know how, just that they are.
1571         """
1572         dtlb_valid_bits = TLBValidBitsArray()
1573         dtlb_tags       = TLBTagsArray()
1574         dtlb_ptes       = TLBPtesArray()
1575         # TODO attribute ram_style of
1576         #  dtlb_tags : signal is "distributed";
1577         # TODO attribute ram_style of
1578         #  dtlb_ptes : signal is "distributed";
1579
1580         r0      = RegStage0("r0")
1581         r0_full = Signal()
1582
1583         r1 = RegStage1("r1")
1584
1585         reservation = Reservation()
1586
1587         # Async signals on incoming request
1588         req_index    = Signal(INDEX_BITS)
1589         req_row      = Signal(ROW_BITS)
1590         req_hit_way  = Signal(WAY_BITS)
1591         req_tag      = Signal(TAG_BITS)
1592         req_op       = Signal(Op)
1593         req_data     = Signal(64)
1594         req_same_tag = Signal()
1595         req_go       = Signal()
1596
1597         early_req_row     = Signal(ROW_BITS)
1598
1599         cancel_store      = Signal()
1600         set_rsrv          = Signal()
1601         clear_rsrv        = Signal()
1602
1603         r0_valid          = Signal()
1604         r0_stall          = Signal()
1605
1606         use_forward1_next = Signal()
1607         use_forward2_next = Signal()
1608
1609         cache_out_row     = Signal(WB_DATA_BITS)
1610
1611         plru_victim       = PLRUOut()
1612         replace_way       = Signal(WAY_BITS)
1613
1614         # Wishbone read/write/cache write formatting signals
1615         bus_sel           = Signal(8)
1616
1617         # TLB signals
1618         tlb_tag_way   = Signal(TLB_TAG_WAY_BITS)
1619         tlb_pte_way   = Signal(TLB_PTE_WAY_BITS)
1620         tlb_valid_way = Signal(TLB_NUM_WAYS)
1621         tlb_req_index = Signal(TLB_SET_BITS)
1622         tlb_hit       = Signal()
1623         tlb_hit_way   = Signal(TLB_WAY_BITS)
1624         pte           = Signal(TLB_PTE_BITS)
1625         ra            = Signal(REAL_ADDR_BITS)
1626         valid_ra      = Signal()
1627         perm_attr     = PermAttr("dc_perms")
1628         rc_ok         = Signal()
1629         perm_ok       = Signal()
1630         access_ok     = Signal()
1631
1632         tlb_plru_victim = TLBPLRUOut()
1633
1634         # we don't yet handle collisions between loadstore1 requests
1635         # and MMU requests
1636         comb += self.m_out.stall.eq(0)
1637
1638         # Hold off the request in r0 when r1 has an uncompleted request
1639         comb += r0_stall.eq(r0_full & (r1.full | d_in.hold))
1640         comb += r0_valid.eq(r0_full & ~r1.full & ~d_in.hold)
1641         comb += self.stall_out.eq(r0_stall)
1642
1643         # Wire up wishbone request latch out of stage 1
1644         comb += self.wb_out.eq(r1.wb)
1645
1646         # deal with litex not doing wishbone pipeline mode
1647         comb += self.wb_in.stall.eq(self.wb_out.cyc & ~self.wb_in.ack)
1648
1649         # call sub-functions putting everything together, using shared
1650         # signals established above
1651         self.stage_0(m, r0, r1, r0_full)
1652         self.tlb_read(m, r0_stall, tlb_valid_way,
1653                       tlb_tag_way, tlb_pte_way, dtlb_valid_bits,
1654                       dtlb_tags, dtlb_ptes)
1655         self.tlb_search(m, tlb_req_index, r0, r0_valid,
1656                         tlb_valid_way, tlb_tag_way, tlb_hit_way,
1657                         tlb_pte_way, pte, tlb_hit, valid_ra, perm_attr, ra)
1658         self.tlb_update(m, r0_valid, r0, dtlb_valid_bits, tlb_req_index,
1659                         tlb_hit_way, tlb_hit, tlb_plru_victim, tlb_tag_way,
1660                         dtlb_tags, tlb_pte_way, dtlb_ptes)
1661         self.maybe_plrus(m, r1, plru_victim)
1662         self.maybe_tlb_plrus(m, r1, tlb_plru_victim)
1663         self.cache_tag_read(m, r0_stall, req_index, cache_tag_set, cache_tags)
1664         self.dcache_request(m, r0, ra, req_index, req_row, req_tag,
1665                            r0_valid, r1, cache_valids, replace_way,
1666                            use_forward1_next, use_forward2_next,
1667                            req_hit_way, plru_victim, rc_ok, perm_attr,
1668                            valid_ra, perm_ok, access_ok, req_op, req_go,
1669                            tlb_pte_way,
1670                            tlb_hit, tlb_hit_way, tlb_valid_way, cache_tag_set,
1671                            cancel_store, req_same_tag, r0_stall, early_req_row)
1672         self.reservation_comb(m, cancel_store, set_rsrv, clear_rsrv,
1673                            r0_valid, r0, reservation)
1674         self.reservation_reg(m, r0_valid, access_ok, set_rsrv, clear_rsrv,
1675                            reservation, r0)
1676         self.writeback_control(m, r1, cache_out_row)
1677         self.rams(m, r1, early_req_row, cache_out_row, replace_way)
1678         self.dcache_fast_hit(m, req_op, r0_valid, r0, r1,
1679                         req_hit_way, req_index, req_tag, access_ok,
1680                         tlb_hit, tlb_hit_way, tlb_req_index)
1681         self.dcache_slow(m, r1, use_forward1_next, use_forward2_next,
1682                     cache_valids, r0, replace_way,
1683                     req_hit_way, req_same_tag,
1684                          r0_valid, req_op, cache_tags, req_go, ra)
1685         #self.dcache_log(m, r1, valid_ra, tlb_hit_way, stall_out)
1686
1687         return m
1688
1689 def dcache_load(dut, addr, nc=0):
1690     yield dut.d_in.load.eq(1)
1691     yield dut.d_in.nc.eq(nc)
1692     yield dut.d_in.addr.eq(addr)
1693     yield dut.d_in.byte_sel.eq(~0)
1694     yield dut.d_in.valid.eq(1)
1695     yield
1696     yield dut.d_in.valid.eq(0)
1697     yield dut.d_in.byte_sel.eq(0)
1698     while not (yield dut.d_out.valid):
1699         yield
1700     # yield # data is valid one cycle AFTER valid goes hi? (no it isn't)
1701     data = yield dut.d_out.data
1702     return data
1703
1704
1705 def dcache_store(dut, addr, data, nc=0):
1706     yield dut.d_in.load.eq(0)
1707     yield dut.d_in.nc.eq(nc)
1708     yield dut.d_in.data.eq(data)
1709     yield dut.d_in.byte_sel.eq(~0)
1710     yield dut.d_in.addr.eq(addr)
1711     yield dut.d_in.valid.eq(1)
1712     yield
1713     yield dut.d_in.valid.eq(0)
1714     yield dut.d_in.byte_sel.eq(0)
1715     while not (yield dut.d_out.valid):
1716         yield
1717
1718
1719 def dcache_random_sim(dut, mem):
1720
1721     # start copy of mem
1722     sim_mem = deepcopy(mem)
1723     memsize = len(sim_mem)
1724     print ("mem len", memsize)
1725
1726     # clear stuff
1727     yield dut.d_in.valid.eq(0)
1728     yield dut.d_in.load.eq(0)
1729     yield dut.d_in.priv_mode.eq(1)
1730     yield dut.d_in.nc.eq(0)
1731     yield dut.d_in.addr.eq(0)
1732     yield dut.d_in.data.eq(0)
1733     yield dut.m_in.valid.eq(0)
1734     yield dut.m_in.addr.eq(0)
1735     yield dut.m_in.pte.eq(0)
1736     # wait 4 * clk_period
1737     yield
1738     yield
1739     yield
1740     yield
1741
1742     print ()
1743
1744     #for i in range(1024):
1745     #    sim_mem[i] = i
1746
1747     for i in range(1024):
1748         addr = randint(0, memsize-1)
1749         data = randint(0, (1<<64)-1)
1750         sim_mem[addr] = data
1751         row = addr
1752         addr *= 8
1753
1754         print ("random testing %d 0x%x row %d data 0x%x" % (i, addr, row, data))
1755
1756         yield from dcache_load(dut, addr)
1757         yield from dcache_store(dut, addr, data)
1758
1759         addr = randint(0, memsize-1)
1760         sim_data = sim_mem[addr]
1761         row = addr
1762         addr *= 8
1763
1764         print ("    load 0x%x row %d expect data 0x%x" % (addr, row, sim_data))
1765         data = yield from dcache_load(dut, addr)
1766         assert data == sim_data, \
1767             "check addr 0x%x row %d data %x != %x" % (addr, row, data, sim_data)
1768
1769     for addr in range(memsize):
1770         data = yield from dcache_load(dut, addr*8)
1771         assert data == sim_mem[addr], \
1772             "final check %x data %x != %x" % (addr*8, data, sim_mem[addr])
1773
1774 def dcache_regression_sim(dut, mem):
1775
1776     # start copy of mem
1777     sim_mem = deepcopy(mem)
1778     memsize = len(sim_mem)
1779     print ("mem len", memsize)
1780
1781     # clear stuff
1782     yield dut.d_in.valid.eq(0)
1783     yield dut.d_in.load.eq(0)
1784     yield dut.d_in.priv_mode.eq(1)
1785     yield dut.d_in.nc.eq(0)
1786     yield dut.d_in.addr.eq(0)
1787     yield dut.d_in.data.eq(0)
1788     yield dut.m_in.valid.eq(0)
1789     yield dut.m_in.addr.eq(0)
1790     yield dut.m_in.pte.eq(0)
1791     # wait 4 * clk_period
1792     yield
1793     yield
1794     yield
1795     yield
1796
1797     addr = 1
1798     row = addr
1799     addr *= 8
1800
1801     print ("random testing %d 0x%x row %d" % (i, addr, row))
1802
1803     yield from dcache_load(dut, addr)
1804
1805     addr = 2
1806     sim_data = sim_mem[addr]
1807     row = addr
1808     addr *= 8
1809
1810     print ("    load 0x%x row %d expect data 0x%x" % (addr, row, sim_data))
1811     data = yield from dcache_load(dut, addr)
1812     assert data == sim_data, \
1813         "check addr 0x%x row %d data %x != %x" % (addr, row, data, sim_data)
1814
1815
1816
1817 def dcache_sim(dut, mem):
1818     # clear stuff
1819     yield dut.d_in.valid.eq(0)
1820     yield dut.d_in.load.eq(0)
1821     yield dut.d_in.priv_mode.eq(1)
1822     yield dut.d_in.nc.eq(0)
1823     yield dut.d_in.addr.eq(0)
1824     yield dut.d_in.data.eq(0)
1825     yield dut.m_in.valid.eq(0)
1826     yield dut.m_in.addr.eq(0)
1827     yield dut.m_in.pte.eq(0)
1828     # wait 4 * clk_period
1829     yield
1830     yield
1831     yield
1832     yield
1833
1834     # Cacheable read of address 4
1835     data = yield from dcache_load(dut, 0x58)
1836     addr = yield dut.d_in.addr
1837     assert data == 0x0000001700000016, \
1838         f"data @%x=%x expected 0x0000001700000016" % (addr, data)
1839
1840     # Cacheable read of address 20
1841     data = yield from dcache_load(dut, 0x20)
1842     addr = yield dut.d_in.addr
1843     assert data == 0x0000000900000008, \
1844         f"data @%x=%x expected 0x0000000900000008" % (addr, data)
1845
1846     # Cacheable read of address 30
1847     data = yield from dcache_load(dut, 0x530)
1848     addr = yield dut.d_in.addr
1849     assert data == 0x0000014D0000014C, \
1850         f"data @%x=%x expected 0000014D0000014C" % (addr, data)
1851
1852     # 2nd Cacheable read of address 30
1853     data = yield from dcache_load(dut, 0x530)
1854     addr = yield dut.d_in.addr
1855     assert data == 0x0000014D0000014C, \
1856         f"data @%x=%x expected 0000014D0000014C" % (addr, data)
1857
1858     # Non-cacheable read of address 100
1859     data = yield from dcache_load(dut, 0x100, nc=1)
1860     addr = yield dut.d_in.addr
1861     assert data == 0x0000004100000040, \
1862         f"data @%x=%x expected 0000004100000040" % (addr, data)
1863
1864     # Store at address 530
1865     yield from dcache_store(dut, 0x530, 0x121)
1866
1867     # Store at address 30
1868     yield from dcache_store(dut, 0x530, 0x12345678)
1869
1870     # 3nd Cacheable read of address 530
1871     data = yield from dcache_load(dut, 0x530)
1872     addr = yield dut.d_in.addr
1873     assert data == 0x12345678, \
1874         f"data @%x=%x expected 0x12345678" % (addr, data)
1875
1876     # 4th Cacheable read of address 20
1877     data = yield from dcache_load(dut, 0x20)
1878     addr = yield dut.d_in.addr
1879     assert data == 0x0000000900000008, \
1880         f"data @%x=%x expected 0x0000000900000008" % (addr, data)
1881
1882     yield
1883     yield
1884     yield
1885     yield
1886
1887
1888 def test_dcache(mem, test_fn, test_name):
1889     dut = DCache()
1890
1891     memory = Memory(width=64, depth=len(mem), init=mem, simulate=True)
1892     sram = SRAM(memory=memory, granularity=8)
1893
1894     m = Module()
1895     m.submodules.dcache = dut
1896     m.submodules.sram = sram
1897
1898     m.d.comb += sram.bus.cyc.eq(dut.wb_out.cyc)
1899     m.d.comb += sram.bus.stb.eq(dut.wb_out.stb)
1900     m.d.comb += sram.bus.we.eq(dut.wb_out.we)
1901     m.d.comb += sram.bus.sel.eq(dut.wb_out.sel)
1902     m.d.comb += sram.bus.adr.eq(dut.wb_out.adr)
1903     m.d.comb += sram.bus.dat_w.eq(dut.wb_out.dat)
1904
1905     m.d.comb += dut.wb_in.ack.eq(sram.bus.ack)
1906     m.d.comb += dut.wb_in.dat.eq(sram.bus.dat_r)
1907
1908     # nmigen Simulation
1909     sim = Simulator(m)
1910     sim.add_clock(1e-6)
1911
1912     sim.add_sync_process(wrap(test_fn(dut, mem)))
1913     with sim.write_vcd('test_dcache%s.vcd' % test_name):
1914         sim.run()
1915
1916 if __name__ == '__main__':
1917     seed(0)
1918     dut = DCache()
1919     vl = rtlil.convert(dut, ports=[])
1920     with open("test_dcache.il", "w") as f:
1921         f.write(vl)
1922
1923     mem = []
1924     memsize = 16
1925     for i in range(memsize):
1926         mem.append(i)
1927
1928     test_dcache(mem, dcache_regression_sim, "random")
1929
1930     exit(0)
1931
1932     mem = []
1933     memsize = 256
1934     for i in range(memsize):
1935         mem.append(i)
1936
1937     test_dcache(mem, dcache_random_sim, "random")
1938
1939     mem = []
1940     for i in range(1024):
1941         mem.append((i*2)| ((i*2+1)<<32))
1942
1943     test_dcache(mem, dcache_sim, "")
1944