Allow more test cases to be run with CXXSim
soc.git: src/soc/experiment/dcache.py
"""DCache

based on Anton Blanchard microwatt dcache.vhdl

"""

from enum import Enum, unique

from nmigen import Module, Signal, Elaboratable, Cat, Repl, Array, Const
from nmutil.util import Display

from random import randint

from nmigen.cli import main
from nmutil.iocontrol import RecordObject
from nmigen.utils import log2_int
from soc.experiment.mem_types import (LoadStore1ToDCacheType,
                                      DCacheToLoadStore1Type,
                                      MMUToDCacheType,
                                      DCacheToMMUType)

from soc.experiment.wb_types import (WB_ADDR_BITS, WB_DATA_BITS, WB_SEL_BITS,
                                     WBAddrType, WBDataType, WBSelType,
                                     WBMasterOut, WBSlaveOut,
                                     WBMasterOutVector, WBSlaveOutVector,
                                     WBIOMasterOut, WBIOSlaveOut)

from soc.experiment.cache_ram import CacheRam
#from soc.experiment.plru import PLRU
from nmutil.plru import PLRU

# for test
from nmigen_soc.wishbone.sram import SRAM
from nmigen import Memory
from nmigen.cli import rtlil

# NOTE: to use cxxsim, export NMIGEN_SIM_MODE=cxxsim from the shell
# Also, check out the cxxsim nmigen branch, and latest yosys from git
from nmutil.sim_tmp_alternative import Simulator

from nmutil.util import wrap


# TODO: make these parameters of DCache at some point
LINE_SIZE = 64    # Line size in bytes
NUM_LINES = 16    # Number of lines in a set
NUM_WAYS = 4      # Number of ways
TLB_SET_SIZE = 64 # L1 DTLB entries per set
TLB_NUM_WAYS = 4  # L1 DTLB number of sets
TLB_LG_PGSZ = 12  # L1 DTLB log_2(page_size)
LOG_LENGTH = 0    # Non-zero to enable log data collection

# BRAM organisation: We never access more than WB_DATA_BITS
# at a time so, to save resources, we make the array only that
# wide, and use consecutive indices to make a cache "line"
#
# ROW_SIZE is the width in bytes of the BRAM
# (based on WB, so 64-bits)
ROW_SIZE = WB_DATA_BITS // 8

# ROW_PER_LINE is the number of rows (wishbone
# transactions) in a line
ROW_PER_LINE = LINE_SIZE // ROW_SIZE

# BRAM_ROWS is the number of rows in BRAM needed
# to represent the full dcache
BRAM_ROWS = NUM_LINES * ROW_PER_LINE

print ("ROW_SIZE", ROW_SIZE)
print ("ROW_PER_LINE", ROW_PER_LINE)
print ("BRAM_ROWS", BRAM_ROWS)
print ("NUM_WAYS", NUM_WAYS)

# Bit fields counts in the address

# REAL_ADDR_BITS is the number of real address
# bits that we store
REAL_ADDR_BITS = 56

# ROW_BITS is the number of bits to select a row
ROW_BITS = log2_int(BRAM_ROWS)

# ROW_LINE_BITS is the number of bits to select
# a row within a line
ROW_LINE_BITS = log2_int(ROW_PER_LINE)

# LINE_OFF_BITS is the number of bits for
# the offset in a cache line
LINE_OFF_BITS = log2_int(LINE_SIZE)

# ROW_OFF_BITS is the number of bits for
# the offset in a row
ROW_OFF_BITS = log2_int(ROW_SIZE)

# INDEX_BITS is the number of bits to
# select a cache line
INDEX_BITS = log2_int(NUM_LINES)

# SET_SIZE_BITS is the log base 2 of the set size
SET_SIZE_BITS = LINE_OFF_BITS + INDEX_BITS

# TAG_BITS is the number of bits of
# the tag part of the address
TAG_BITS = REAL_ADDR_BITS - SET_SIZE_BITS

# TAG_WIDTH is the width in bits of each way of the tag RAM
TAG_WIDTH = TAG_BITS + 7 - ((TAG_BITS + 7) % 8)
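# Worked example (with the default geometry above): TAG_BITS = 56 - 10 = 46,
# so TAG_WIDTH rounds up to 48, the next multiple of 8, keeping each way's
# tag byte-aligned within the tag RAM row.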

# WAY_BITS is the number of bits to select a way
WAY_BITS = log2_int(NUM_WAYS)

# Example of layout for 32 lines of 64 bytes:
layout = """\
  ..  tag    |index|  line  |
  ..         |   row   |    |
  ..         |     |---|    | ROW_LINE_BITS  (3)
  ..         |     |--- - --| LINE_OFF_BITS (6)
  ..         |         |- --| ROW_OFF_BITS  (3)
  ..         |----- ---|    | ROW_BITS      (8)
  ..         |-----|        | INDEX_BITS    (5)
  ..  -------|              | TAG_BITS      (45)
"""
print (layout)
print ("Dcache TAG %d IDX %d ROW_BITS %d ROFF %d LOFF %d RLB %d" % \
            (TAG_BITS, INDEX_BITS, ROW_BITS,
             ROW_OFF_BITS, LINE_OFF_BITS, ROW_LINE_BITS))
print ("index @: %d-%d" % (LINE_OFF_BITS, SET_SIZE_BITS))
print ("row @: %d-%d" % (ROW_OFF_BITS, SET_SIZE_BITS))
print ("tag @: %d-%d width %d" % (SET_SIZE_BITS, REAL_ADDR_BITS, TAG_WIDTH))

TAG_RAM_WIDTH = TAG_WIDTH * NUM_WAYS

print ("TAG_RAM_WIDTH", TAG_RAM_WIDTH)
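
# Illustrative sketch (plain Python, not part of the hardware): how a real
# address decomposes under this geometry, mirroring get_index/get_row/get_tag
# below.  The function name is ours, added for illustration only.
def explain_addr(addr):
    """Split an integer address into (tag, index, row, row offset)."""
    row_off = addr & ((1 << ROW_OFF_BITS) - 1)
    index   = (addr >> LINE_OFF_BITS) & ((1 << INDEX_BITS) - 1)
    row     = (addr >> ROW_OFF_BITS) & ((1 << ROW_BITS) - 1)
    tag     = (addr >> SET_SIZE_BITS) & ((1 << TAG_BITS) - 1)
    return tag, index, row, row_off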

def CacheTagArray():
    return Array(Signal(TAG_RAM_WIDTH, name="cachetag_%d" % x) \
                        for x in range(NUM_LINES))

def CacheValidBitsArray():
    return Array(Signal(NUM_WAYS, name="cachevalid_%d" % x) \
                        for x in range(NUM_LINES))

def RowPerLineValidArray():
    return Array(Signal(name="rows_valid%d" % x) \
                        for x in range(ROW_PER_LINE))

# L1 TLB
TLB_SET_BITS = log2_int(TLB_SET_SIZE)
TLB_WAY_BITS = log2_int(TLB_NUM_WAYS)
TLB_EA_TAG_BITS = 64 - (TLB_LG_PGSZ + TLB_SET_BITS)
TLB_TAG_WAY_BITS = TLB_NUM_WAYS * TLB_EA_TAG_BITS
TLB_PTE_BITS = 64
TLB_PTE_WAY_BITS = TLB_NUM_WAYS * TLB_PTE_BITS
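
# With the defaults above: TLB_SET_BITS = 6, TLB_WAY_BITS = 2,
# TLB_EA_TAG_BITS = 64 - (12 + 6) = 46, TLB_TAG_WAY_BITS = 4 * 46 = 184,
# and TLB_PTE_WAY_BITS = 4 * 64 = 256.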

def ispow2(x):
    return (1<<log2_int(x, False)) == x

assert (LINE_SIZE % ROW_SIZE) == 0, "LINE_SIZE not multiple of ROW_SIZE"
assert ispow2(LINE_SIZE), "LINE_SIZE not power of 2"
assert ispow2(NUM_LINES), "NUM_LINES not power of 2"
assert ispow2(ROW_PER_LINE), "ROW_PER_LINE not power of 2"
assert ROW_BITS == (INDEX_BITS + ROW_LINE_BITS), "geometry bits don't add up"
assert (LINE_OFF_BITS == ROW_OFF_BITS + ROW_LINE_BITS), \
        "geometry bits don't add up"
assert REAL_ADDR_BITS == (TAG_BITS + INDEX_BITS + LINE_OFF_BITS), \
        "geometry bits don't add up"
assert REAL_ADDR_BITS == (TAG_BITS + ROW_BITS + ROW_OFF_BITS), \
        "geometry bits don't add up"
assert 64 == WB_DATA_BITS, "Can't yet handle wb width that isn't 64-bits"
assert SET_SIZE_BITS <= TLB_LG_PGSZ, "Set indexed by virtual address"


def TLBValidBitsArray():
    return Array(Signal(TLB_NUM_WAYS, name="tlbvalid%d" % x) \
                for x in range(TLB_SET_SIZE))

def TLBTagEAArray():
    return Array(Signal(TLB_EA_TAG_BITS, name="tlbtagea%d" % x) \
                for x in range (TLB_NUM_WAYS))

def TLBTagsArray():
    return Array(Signal(TLB_TAG_WAY_BITS, name="tlbtags%d" % x) \
                for x in range (TLB_SET_SIZE))

def TLBPtesArray():
    return Array(Signal(TLB_PTE_WAY_BITS, name="tlbptes%d" % x) \
                for x in range(TLB_SET_SIZE))

def HitWaySet():
    return Array(Signal(WAY_BITS, name="hitway_%d" % x) \
                        for x in range(TLB_NUM_WAYS))

# Cache RAM interface
def CacheRamOut():
    return Array(Signal(WB_DATA_BITS, name="cache_out%d" % x) \
                 for x in range(NUM_WAYS))

# PLRU output interface
def PLRUOut():
    return Array(Signal(WAY_BITS, name="plru_out%d" % x) \
                for x in range(NUM_LINES))

# TLB PLRU output interface
def TLBPLRUOut():
    return Array(Signal(TLB_WAY_BITS, name="tlbplru_out%d" % x) \
                for x in range(TLB_SET_SIZE))

# Helper functions to decode incoming requests
#
# Return the cache line index (tag index) for an address
def get_index(addr):
    return addr[LINE_OFF_BITS:SET_SIZE_BITS]

# Return the cache row index (data memory) for an address
def get_row(addr):
    return addr[ROW_OFF_BITS:SET_SIZE_BITS]

# Return the index of a row within a line
def get_row_of_line(row):
    return row[:ROW_BITS][:ROW_LINE_BITS]

# Returns whether this is the last row of a line
def is_last_row_addr(addr, last):
    return addr[ROW_OFF_BITS:LINE_OFF_BITS] == last

# Returns whether this is the last row of a line
def is_last_row(row, last):
    return get_row_of_line(row) == last

# Return the next row in the current cache line. We use a
# dedicated function in order to limit the size of the
# generated adder to be only the bits within a cache line
# (3 bits with default settings)
def next_row(row):
    row_v = row[0:ROW_LINE_BITS] + 1
    return Cat(row_v[:ROW_LINE_BITS], row[ROW_LINE_BITS:])
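
# Illustrative sketch (plain Python, ours for illustration only): next_row
# increments just the low ROW_LINE_BITS and wraps within the line, so a
# 3-bit adder suffices.  A software model of the same behaviour:
def _next_row_model(row):
    """Integer model of next_row: wrap only the low ROW_LINE_BITS bits."""
    low = (row + 1) & ((1 << ROW_LINE_BITS) - 1)    # wraps e.g. 7 -> 0
    return (row & ~((1 << ROW_LINE_BITS) - 1)) | low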

# Get the tag value from the address
def get_tag(addr):
    return addr[SET_SIZE_BITS:REAL_ADDR_BITS]

# Read a tag from a tag memory row
def read_tag(way, tagset):
    return tagset.word_select(way, TAG_WIDTH)[:TAG_BITS]

# Read a TLB tag from a TLB tag memory row
def read_tlb_tag(way, tags):
    return tags.word_select(way, TLB_EA_TAG_BITS)

# Write a TLB tag to a TLB tag memory row
def write_tlb_tag(way, tags, tag):
    return read_tlb_tag(way, tags).eq(tag)

# Read a PTE from a TLB PTE memory row
def read_tlb_pte(way, ptes):
    return ptes.word_select(way, TLB_PTE_BITS)

# Write a PTE to a TLB PTE memory row
def write_tlb_pte(way, ptes, newpte):
    return read_tlb_pte(way, ptes).eq(newpte)
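
# Illustrative sketch (plain Python, ours for illustration only): the per-way
# fields are packed LSB-first into one wide row, so way w occupies bits
# [w*WIDTH : (w+1)*WIDTH] -- exactly what word_select() above selects.
def _read_tlb_pte_model(way, ptes_value):
    """Extract one way's PTE from an integer holding a packed PTE row."""
    return (ptes_value >> (way * TLB_PTE_BITS)) & ((1 << TLB_PTE_BITS) - 1)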


# Record for storing permission, attribute, etc. bits from a PTE
class PermAttr(RecordObject):
    def __init__(self, name=None):
        super().__init__(name=name)
        self.reference = Signal()
        self.changed = Signal()
        self.nocache = Signal()
        self.priv = Signal()
        self.rd_perm = Signal()
        self.wr_perm = Signal()


def extract_perm_attr(pte):
    pa = PermAttr()
    # TODO: stub - does not yet extract any fields from the PTE
    return pa


# Type of operation on a "valid" input
@unique
class Op(Enum):
    OP_NONE = 0
    OP_BAD = 1        # NC cache hit, TLB miss, prot/RC failure
    OP_STCX_FAIL = 2  # conditional store w/o reservation
    OP_LOAD_HIT = 3   # Cache hit on load
    OP_LOAD_MISS = 4  # Load missing cache
    OP_LOAD_NC = 5    # Non-cacheable load
    OP_STORE_HIT = 6  # Store hitting cache
    OP_STORE_MISS = 7 # Store missing cache


# Cache state machine
@unique
class State(Enum):
    IDLE = 0             # Normal load hit processing
    RELOAD_WAIT_ACK = 1  # Cache reload wait ack
    STORE_WAIT_ACK = 2   # Store wait ack
    NC_LOAD_WAIT_ACK = 3 # Non-cacheable load wait ack


# Dcache operations:
#
# In order to make timing, we use the BRAMs with
# an output buffer, which means that the BRAM
# output is delayed by an extra cycle.
#
# Thus, the dcache has a 2-stage internal pipeline
# for cache hits with no stalls.
#
# All other operations are handled via stalling
# in the first stage.
#
# The second stage can thus complete a hit at the same
# time as the first stage emits a stall for a complex op.
#
# Stage 0 register, basically contains just the latched request

class RegStage0(RecordObject):
    def __init__(self, name=None):
        super().__init__(name=name)
        self.req = LoadStore1ToDCacheType(name="lsmem")
        self.tlbie = Signal()
        self.doall = Signal()
        self.tlbld = Signal()
        self.mmu_req = Signal() # indicates source of request


class MemAccessRequest(RecordObject):
    def __init__(self, name=None):
        super().__init__(name=name)
        self.op = Signal(Op)
        self.valid = Signal()
        self.dcbz = Signal()
        self.real_addr = Signal(REAL_ADDR_BITS)
        self.data = Signal(64)
        self.byte_sel = Signal(8)
        self.hit_way = Signal(WAY_BITS)
        self.same_tag = Signal()
        self.mmu_req = Signal()


# First stage register, contains state for stage 1 of load hits
# and for the state machine used by all other operations
class RegStage1(RecordObject):
    def __init__(self, name=None):
        super().__init__(name=name)
        # Info about the request
        self.full = Signal()    # have uncompleted request
        self.mmu_req = Signal() # request is from MMU
        self.req = MemAccessRequest(name="reqmem")

        # Cache hit state
        self.hit_way = Signal(WAY_BITS)
        self.hit_load_valid = Signal()
        self.hit_index = Signal(INDEX_BITS)
        self.cache_hit = Signal()

        # TLB hit state
        self.tlb_hit = Signal()
        self.tlb_hit_way = Signal(TLB_WAY_BITS)   # way number that hit
        self.tlb_hit_index = Signal(TLB_SET_BITS) # TLB set that hit

        # 2-stage data buffer for data forwarded from writes to reads
        self.forward_data1 = Signal(64)
        self.forward_data2 = Signal(64)
        self.forward_sel1 = Signal(8)
        self.forward_valid1 = Signal()
        self.forward_way1 = Signal(WAY_BITS)
        self.forward_row1 = Signal(ROW_BITS)
        self.use_forward1 = Signal()
        self.forward_sel = Signal(8)

        # Cache miss state (reload state machine)
        self.state = Signal(State)
        self.dcbz = Signal()
        self.write_bram = Signal()
        self.write_tag = Signal()
        self.slow_valid = Signal()
        self.real_adr = Signal(REAL_ADDR_BITS)
        self.wb = WBMasterOut("wb")
        self.reload_tag = Signal(TAG_BITS)
        self.store_way = Signal(WAY_BITS)
        self.store_row = Signal(ROW_BITS)
        self.store_index = Signal(INDEX_BITS)
        self.end_row_ix = Signal(ROW_LINE_BITS)
        self.rows_valid = RowPerLineValidArray()
        self.acks_pending = Signal(3)
        self.inc_acks = Signal()
        self.dec_acks = Signal()

        # Signals to complete (possibly with error)
        self.ls_valid = Signal()
        self.ls_error = Signal()
        self.mmu_done = Signal()
        self.mmu_error = Signal()
        self.cache_paradox = Signal()

        # Signal to complete a failed stcx.
        self.stcx_fail = Signal()


# Reservation information
class Reservation(RecordObject):
    def __init__(self):
        super().__init__()
        self.valid = Signal()
        self.addr = Signal(64-LINE_OFF_BITS)


class DTLBUpdate(Elaboratable):
    def __init__(self):
        self.tlbie = Signal()
        self.tlbwe = Signal()
        self.doall = Signal()
        self.updated = Signal()
        self.v_updated = Signal()
        self.tlb_hit = Signal()
        self.tlb_req_index = Signal(TLB_SET_BITS)

        self.tlb_hit_way = Signal(TLB_WAY_BITS)
        self.tlb_tag_way = Signal(TLB_TAG_WAY_BITS)
        self.tlb_pte_way = Signal(TLB_PTE_WAY_BITS)
        self.repl_way = Signal(TLB_WAY_BITS)
        self.eatag = Signal(TLB_EA_TAG_BITS)
        self.pte_data = Signal(TLB_PTE_BITS)

        self.dv = Signal(TLB_NUM_WAYS)         # current valid bits, per way

        self.tb_out = Signal(TLB_TAG_WAY_BITS) # updated tag row
        self.pb_out = Signal(TLB_PTE_WAY_BITS) # updated PTE row
        self.db_out = Signal(TLB_NUM_WAYS)     # updated valid bits

    def elaborate(self, platform):
        m = Module()
        comb = m.d.comb
        sync = m.d.sync

        tagset = Signal(TLB_TAG_WAY_BITS)
        pteset = Signal(TLB_PTE_WAY_BITS)

        tb_out, pb_out, db_out = self.tb_out, self.pb_out, self.db_out

        with m.If(self.tlbie & self.doall):
            pass # clear all back in parent
        with m.Elif(self.tlbie):
            with m.If(self.tlb_hit):
                comb += db_out.eq(self.dv)
                # invalidate the TLB entry that hit
                comb += db_out.bit_select(self.tlb_hit_way, 1).eq(0)
                comb += self.v_updated.eq(1)

        with m.Elif(self.tlbwe):

            comb += tagset.eq(self.tlb_tag_way)
            comb += write_tlb_tag(self.repl_way, tagset, self.eatag)
            comb += tb_out.eq(tagset)

            comb += pteset.eq(self.tlb_pte_way)
            comb += write_tlb_pte(self.repl_way, pteset, self.pte_data)
            comb += pb_out.eq(pteset)

            # mark the written way valid, preserving the other ways
            comb += db_out.eq(self.dv)
            comb += db_out.bit_select(self.repl_way, 1).eq(1)

            comb += self.updated.eq(1)
            comb += self.v_updated.eq(1)

        return m


class DCachePendingHit(Elaboratable):

    def __init__(self, tlb_pte_way, tlb_valid_way, tlb_hit_way,
                 cache_valid_idx, cache_tag_set,
                 req_addr,
                 hit_set):

        self.go = Signal()
        self.virt_mode = Signal()
        self.is_hit = Signal()
        self.tlb_hit = Signal()
        self.hit_way = Signal(WAY_BITS)
        self.rel_match = Signal()
        self.req_index = Signal(INDEX_BITS)
        self.reload_tag = Signal(TAG_BITS)

        self.tlb_hit_way = tlb_hit_way
        self.tlb_pte_way = tlb_pte_way
        self.tlb_valid_way = tlb_valid_way
        self.cache_valid_idx = cache_valid_idx
        self.cache_tag_set = cache_tag_set
        self.req_addr = req_addr
        self.hit_set = hit_set

    def elaborate(self, platform):
        m = Module()
        comb = m.d.comb
        sync = m.d.sync

        go = self.go
        virt_mode = self.virt_mode
        is_hit = self.is_hit
        tlb_pte_way = self.tlb_pte_way
        tlb_valid_way = self.tlb_valid_way
        cache_valid_idx = self.cache_valid_idx
        cache_tag_set = self.cache_tag_set
        req_addr = self.req_addr
        tlb_hit_way = self.tlb_hit_way
        tlb_hit = self.tlb_hit
        hit_set = self.hit_set
        hit_way = self.hit_way
        rel_match = self.rel_match
        req_index = self.req_index
        reload_tag = self.reload_tag

        rel_matches = Array(Signal(name="rel_matches_%d" % i) \
                            for i in range(TLB_NUM_WAYS))
        hit_way_set = HitWaySet()

        # Test if pending request is a hit on any way
        # In order to make timing in virtual mode,
        # when we are using the TLB, we compare each
        # way with each of the real addresses from each way of
        # the TLB, and then decide later which match to use.

        with m.If(virt_mode):
            for j in range(TLB_NUM_WAYS):
                s_tag = Signal(TAG_BITS, name="s_tag%d" % j)
                s_hit = Signal()
                s_pte = Signal(TLB_PTE_BITS)
                s_ra = Signal(REAL_ADDR_BITS)
                comb += s_pte.eq(read_tlb_pte(j, tlb_pte_way))
                comb += s_ra.eq(Cat(req_addr[0:TLB_LG_PGSZ],
                                    s_pte[TLB_LG_PGSZ:REAL_ADDR_BITS]))
                comb += s_tag.eq(get_tag(s_ra))

                for i in range(NUM_WAYS):
                    is_tag_hit = Signal(name="is_tag_hit_%d_%d" % (j, i))
                    comb += is_tag_hit.eq(go & cache_valid_idx[i] &
                                  (read_tag(i, cache_tag_set) == s_tag)
                                  & tlb_valid_way[j])
                    with m.If(is_tag_hit):
                        comb += hit_way_set[j].eq(i)
                        comb += s_hit.eq(1)
                comb += hit_set[j].eq(s_hit)
                with m.If(s_tag == reload_tag):
                    comb += rel_matches[j].eq(1)
            with m.If(tlb_hit):
                comb += is_hit.eq(hit_set[tlb_hit_way])
                comb += hit_way.eq(hit_way_set[tlb_hit_way])
                comb += rel_match.eq(rel_matches[tlb_hit_way])
        with m.Else():
            s_tag = Signal(TAG_BITS)
            comb += s_tag.eq(get_tag(req_addr))
            for i in range(NUM_WAYS):
                is_tag_hit = Signal(name="is_tag_hit_%d" % i)
                comb += is_tag_hit.eq(go & cache_valid_idx[i] &
                          (read_tag(i, cache_tag_set) == s_tag))
                with m.If(is_tag_hit):
                    comb += hit_way.eq(i)
                    comb += is_hit.eq(1)
            with m.If(s_tag == reload_tag):
                comb += rel_match.eq(1)

        return m


class DCache(Elaboratable):
    """Set associative dcache write-through
    TODO (in no specific order):
    * See list in icache.vhdl
    * Complete load misses on the cycle when WB data comes instead of
      at the end of line (this requires dealing with requests coming in
      while not idle...)
    """
    def __init__(self):
        self.d_in = LoadStore1ToDCacheType("d_in")
        self.d_out = DCacheToLoadStore1Type("d_out")

        self.m_in = MMUToDCacheType("m_in")
        self.m_out = DCacheToMMUType("m_out")

        self.stall_out = Signal()

        self.wb_out = WBMasterOut()
        self.wb_in = WBSlaveOut()

        self.log_out = Signal(20)

    def stage_0(self, m, r0, r1, r0_full):
        """Latch the request in r0.req as long as we're not stalling
        """
        comb = m.d.comb
        sync = m.d.sync
        d_in, d_out, m_in = self.d_in, self.d_out, self.m_in

        r = RegStage0("stage0")

        # TODO: this goes in unit tests and formal proofs
        with m.If(d_in.valid & m_in.valid):
            sync += Display("request collision loadstore vs MMU")

        with m.If(m_in.valid):
            sync += r.req.valid.eq(1)
            sync += r.req.load.eq(~(m_in.tlbie | m_in.tlbld))
            sync += r.req.dcbz.eq(0)
            sync += r.req.nc.eq(0)
            sync += r.req.reserve.eq(0)
            sync += r.req.virt_mode.eq(0)
            sync += r.req.priv_mode.eq(1)
            sync += r.req.addr.eq(m_in.addr)
            sync += r.req.data.eq(m_in.pte)
            sync += r.req.byte_sel.eq(~0) # Const -1 sets all bits to 1
            sync += r.tlbie.eq(m_in.tlbie)
            sync += r.doall.eq(m_in.doall)
            sync += r.tlbld.eq(m_in.tlbld)
            sync += r.mmu_req.eq(1)
        with m.Else():
            sync += r.req.eq(d_in)
            sync += r.tlbie.eq(0)
            sync += r.doall.eq(0)
            sync += r.tlbld.eq(0)
            sync += r.mmu_req.eq(0)
        with m.If(~(r1.full & r0_full)):
            sync += r0.eq(r)
            sync += r0_full.eq(r.req.valid)

    def tlb_read(self, m, r0_stall, tlb_valid_way,
                 tlb_tag_way, tlb_pte_way, dtlb_valid_bits,
                 dtlb_tags, dtlb_ptes):
        """TLB
        Operates in the second cycle on the request latched in r0.req.
        TLB updates write the entry at the end of the second cycle.
        """
        comb = m.d.comb
        sync = m.d.sync
        m_in, d_in = self.m_in, self.d_in

        index = Signal(TLB_SET_BITS)
        addrbits = Signal(TLB_SET_BITS)

        amin = TLB_LG_PGSZ
        amax = TLB_LG_PGSZ + TLB_SET_BITS

        with m.If(m_in.valid):
            comb += addrbits.eq(m_in.addr[amin : amax])
        with m.Else():
            comb += addrbits.eq(d_in.addr[amin : amax])
        comb += index.eq(addrbits)

        # If we have any op and the previous op isn't finished,
        # then keep the same output for next cycle.
        with m.If(~r0_stall):
            sync += tlb_valid_way.eq(dtlb_valid_bits[index])
            sync += tlb_tag_way.eq(dtlb_tags[index])
            sync += tlb_pte_way.eq(dtlb_ptes[index])

    def maybe_tlb_plrus(self, m, r1, tlb_plru_victim):
        """Generate TLB PLRUs
        """
        comb = m.d.comb
        sync = m.d.sync

        if TLB_NUM_WAYS == 0:
            return
        for i in range(TLB_SET_SIZE):
            # TLB PLRU interface
            tlb_plru = PLRU(TLB_WAY_BITS)
            setattr(m.submodules, "maybe_plru_%d" % i, tlb_plru)
            tlb_plru_acc_en = Signal()

            comb += tlb_plru_acc_en.eq(r1.tlb_hit & (r1.tlb_hit_index == i))
            comb += tlb_plru.acc_en.eq(tlb_plru_acc_en)
            comb += tlb_plru.acc_i.eq(r1.tlb_hit_way)
            comb += tlb_plru_victim[i].eq(tlb_plru.lru_o)

    def tlb_search(self, m, tlb_req_index, r0, r0_valid,
                   tlb_valid_way, tlb_tag_way, tlb_hit_way,
                   tlb_pte_way, pte, tlb_hit, valid_ra, perm_attr, ra):

        comb = m.d.comb
        sync = m.d.sync

        hitway = Signal(TLB_WAY_BITS)
        hit = Signal()
        eatag = Signal(TLB_EA_TAG_BITS)

        TLB_LG_END = TLB_LG_PGSZ + TLB_SET_BITS
        comb += tlb_req_index.eq(r0.req.addr[TLB_LG_PGSZ : TLB_LG_END])
        comb += eatag.eq(r0.req.addr[TLB_LG_END : 64 ])

        for i in range(TLB_NUM_WAYS):
            is_tag_hit = Signal()
            comb += is_tag_hit.eq(tlb_valid_way[i]
                                  & (read_tlb_tag(i, tlb_tag_way) == eatag))
            with m.If(is_tag_hit):
                comb += hitway.eq(i)
                comb += hit.eq(1)

        comb += tlb_hit.eq(hit & r0_valid)
        comb += tlb_hit_way.eq(hitway)

        with m.If(tlb_hit):
            comb += pte.eq(read_tlb_pte(hitway, tlb_pte_way))
        with m.Else():
            comb += pte.eq(0)
        comb += valid_ra.eq(tlb_hit | ~r0.req.virt_mode)
        with m.If(r0.req.virt_mode):
            comb += ra.eq(Cat(Const(0, ROW_OFF_BITS),
                              r0.req.addr[ROW_OFF_BITS:TLB_LG_PGSZ],
                              pte[TLB_LG_PGSZ:REAL_ADDR_BITS]))
            comb += perm_attr.reference.eq(pte[8])
            comb += perm_attr.changed.eq(pte[7])
            comb += perm_attr.nocache.eq(pte[5])
            comb += perm_attr.priv.eq(pte[3])
            comb += perm_attr.rd_perm.eq(pte[2])
            comb += perm_attr.wr_perm.eq(pte[1])
        with m.Else():
            comb += ra.eq(Cat(Const(0, ROW_OFF_BITS),
                              r0.req.addr[ROW_OFF_BITS:REAL_ADDR_BITS]))

            comb += perm_attr.reference.eq(1)
            comb += perm_attr.changed.eq(1)
            comb += perm_attr.nocache.eq(0)
            comb += perm_attr.priv.eq(1)
            comb += perm_attr.rd_perm.eq(1)
            comb += perm_attr.wr_perm.eq(1)

    def tlb_update(self, m, r0_valid, r0, dtlb_valid_bits, tlb_req_index,
                   tlb_hit_way, tlb_hit, tlb_plru_victim, tlb_tag_way,
                   dtlb_tags, tlb_pte_way, dtlb_ptes):

        comb = m.d.comb
        sync = m.d.sync

        tlbie = Signal()
        tlbwe = Signal()

        comb += tlbie.eq(r0_valid & r0.tlbie)
        comb += tlbwe.eq(r0_valid & r0.tlbld)

        m.submodules.tlb_update = d = DTLBUpdate()
        with m.If(tlbie & r0.doall):
            # clear all valid bits at once
            for i in range(TLB_SET_SIZE):
                sync += dtlb_valid_bits[i].eq(0)
        with m.If(d.updated):
            sync += dtlb_tags[tlb_req_index].eq(d.tb_out)
            sync += dtlb_ptes[tlb_req_index].eq(d.pb_out)
        with m.If(d.v_updated):
            sync += dtlb_valid_bits[tlb_req_index].eq(d.db_out)

        comb += d.dv.eq(dtlb_valid_bits[tlb_req_index])

        comb += d.tlbie.eq(tlbie)
        comb += d.tlbwe.eq(tlbwe)
        comb += d.doall.eq(r0.doall)
        comb += d.tlb_hit.eq(tlb_hit)
        comb += d.tlb_hit_way.eq(tlb_hit_way)
        comb += d.tlb_tag_way.eq(tlb_tag_way)
        comb += d.tlb_pte_way.eq(tlb_pte_way)
        comb += d.tlb_req_index.eq(tlb_req_index)

        with m.If(tlb_hit):
            comb += d.repl_way.eq(tlb_hit_way)
        with m.Else():
            comb += d.repl_way.eq(tlb_plru_victim[tlb_req_index])
        comb += d.eatag.eq(r0.req.addr[TLB_LG_PGSZ + TLB_SET_BITS:64])
        comb += d.pte_data.eq(r0.req.data)

    def maybe_plrus(self, m, r1, plru_victim):
        """Generate PLRUs
        """
        comb = m.d.comb
        sync = m.d.sync

        if NUM_WAYS == 0:
            return

        for i in range(NUM_LINES):
            # PLRU interface
            plru = PLRU(WAY_BITS)
            setattr(m.submodules, "plru%d" % i, plru)
            plru_acc_en = Signal()

            comb += plru_acc_en.eq(r1.cache_hit & (r1.hit_index == i))
            comb += plru.acc_en.eq(plru_acc_en)
            comb += plru.acc_i.eq(r1.hit_way)
            comb += plru_victim[i].eq(plru.lru_o)

    def cache_tag_read(self, m, r0_stall, req_index, cache_tag_set, cache_tags):
        """Cache tag RAM read port
        """
        comb = m.d.comb
        sync = m.d.sync
        m_in, d_in = self.m_in, self.d_in

        index = Signal(INDEX_BITS)

        with m.If(r0_stall):
            comb += index.eq(req_index)
        with m.Elif(m_in.valid):
            comb += index.eq(get_index(m_in.addr))
        with m.Else():
            comb += index.eq(get_index(d_in.addr))
        sync += cache_tag_set.eq(cache_tags[index])

    def dcache_request(self, m, r0, ra, req_index, req_row, req_tag,
                       r0_valid, r1, cache_valids, replace_way,
                       use_forward1_next, use_forward2_next,
                       req_hit_way, plru_victim, rc_ok, perm_attr,
                       valid_ra, perm_ok, access_ok, req_op, req_go,
                       tlb_pte_way,
                       tlb_hit, tlb_hit_way, tlb_valid_way, cache_tag_set,
                       cancel_store, req_same_tag, r0_stall, early_req_row):
        """Cache request parsing and hit detection
        """

        comb = m.d.comb
        sync = m.d.sync
        m_in, d_in = self.m_in, self.d_in

        is_hit = Signal()
        hit_way = Signal(WAY_BITS)
        op = Signal(Op)
        opsel = Signal(3)
        go = Signal()
        nc = Signal()
        hit_set = Array(Signal(name="hit_set_%d" % i) \
                        for i in range(TLB_NUM_WAYS))
        cache_valid_idx = Signal(NUM_WAYS)

        # Extract line, row and tag from request
        comb += req_index.eq(get_index(r0.req.addr))
        comb += req_row.eq(get_row(r0.req.addr))
        comb += req_tag.eq(get_tag(ra))

        if False: # display on comb is a bit... busy.
            comb += Display("dcache_req addr:%x ra: %x idx: %x tag: %x row: %x",
                            r0.req.addr, ra, req_index, req_tag, req_row)

        comb += go.eq(r0_valid & ~(r0.tlbie | r0.tlbld) & ~r1.ls_error)
        comb += cache_valid_idx.eq(cache_valids[req_index])

        m.submodules.dcache_pend = dc = DCachePendingHit(tlb_pte_way,
                                tlb_valid_way, tlb_hit_way,
                                cache_valid_idx, cache_tag_set,
                                r0.req.addr,
                                hit_set)

        comb += dc.tlb_hit.eq(tlb_hit)
        comb += dc.reload_tag.eq(r1.reload_tag)
        comb += dc.virt_mode.eq(r0.req.virt_mode)
        comb += dc.go.eq(go)
        comb += dc.req_index.eq(req_index)
        comb += is_hit.eq(dc.is_hit)
        comb += hit_way.eq(dc.hit_way)
        comb += req_same_tag.eq(dc.rel_match)

        # See if the request matches the line currently being reloaded
        with m.If((r1.state == State.RELOAD_WAIT_ACK) &
                  (req_index == r1.store_index) & req_same_tag):
            # For a store, consider this a hit even if the row isn't
            # valid since it will be by the time we perform the store.
            # For a load, check the appropriate row valid bit.
            rrow = Signal(ROW_LINE_BITS)
            comb += rrow.eq(req_row)
            valid = r1.rows_valid[rrow]
            comb += is_hit.eq(~r0.req.load | valid)
            comb += hit_way.eq(replace_way)

        # Whether to use forwarded data for a load or not
        with m.If((get_row(r1.req.real_addr) == req_row) &
                  (r1.req.hit_way == hit_way)):
            # Only need to consider r1.write_bram here, since if we
            # are writing refill data here, then we don't have a
            # cache hit this cycle on the line being refilled.
            # (There is the possibility that the load following the
            # load miss that started the refill could be to the old
            # contents of the victim line, since it is a couple of
            # cycles after the refill starts before we see the updated
            # cache tag. In that case we don't use the bypass.)
            comb += use_forward1_next.eq(r1.write_bram)
        with m.If((r1.forward_row1 == req_row) & (r1.forward_way1 == hit_way)):
            comb += use_forward2_next.eq(r1.forward_valid1)

        # The way that matched on a hit
        comb += req_hit_way.eq(hit_way)

        # The way to replace on a miss
        with m.If(r1.write_tag):
            comb += replace_way.eq(plru_victim[r1.store_index])
        with m.Else():
            comb += replace_way.eq(r1.store_way)

        # work out whether we have permission for this access
        # NB we don't yet implement AMR, thus no KUAP
        comb += rc_ok.eq(perm_attr.reference
                         & (r0.req.load | perm_attr.changed)
                         )
        comb += perm_ok.eq((r0.req.priv_mode | ~perm_attr.priv) &
                           (perm_attr.wr_perm |
                            (r0.req.load & perm_attr.rd_perm)))
        comb += access_ok.eq(valid_ra & perm_ok & rc_ok)
        # Combine the request and cache hit status to decide what
        # operation needs to be done
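        # opsel packs (is_hit, nc, load) LSB-first; reading the Cases below
        # as {load, nc, is_hit} bits:
        #   0b101 cacheable load hit      -> OP_LOAD_HIT
        #   0b100 cacheable load miss     -> OP_LOAD_MISS
        #   0b110 non-cacheable load      -> OP_LOAD_NC
        #   0b001 cacheable store hit     -> OP_STORE_HIT
        #   0b000 / 0b010 store miss / NC -> OP_STORE_MISS
        #   0b011 / 0b111 NC access hit   -> OP_BAD (cache paradox)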
        comb += nc.eq(r0.req.nc | perm_attr.nocache)
        comb += op.eq(Op.OP_NONE)
        with m.If(go):
            with m.If(~access_ok):
                comb += op.eq(Op.OP_BAD)
            with m.Elif(cancel_store):
                comb += op.eq(Op.OP_STCX_FAIL)
            with m.Else():
                comb += opsel.eq(Cat(is_hit, nc, r0.req.load))
                with m.Switch(opsel):
                    with m.Case(0b101): comb += op.eq(Op.OP_LOAD_HIT)
                    with m.Case(0b100): comb += op.eq(Op.OP_LOAD_MISS)
                    with m.Case(0b110): comb += op.eq(Op.OP_LOAD_NC)
                    with m.Case(0b001): comb += op.eq(Op.OP_STORE_HIT)
                    with m.Case(0b000): comb += op.eq(Op.OP_STORE_MISS)
                    with m.Case(0b010): comb += op.eq(Op.OP_STORE_MISS)
                    with m.Case(0b011): comb += op.eq(Op.OP_BAD)
                    with m.Case(0b111): comb += op.eq(Op.OP_BAD)
        comb += req_op.eq(op)
        comb += req_go.eq(go)

        # Version of the row number that is valid one cycle earlier
        # in the cases where we need to read the cache data BRAM.
        # If we're stalling then we need to keep reading the last
        # row requested.
        with m.If(~r0_stall):
            with m.If(m_in.valid):
                comb += early_req_row.eq(get_row(m_in.addr))
            with m.Else():
                comb += early_req_row.eq(get_row(d_in.addr))
        with m.Else():
            comb += early_req_row.eq(req_row)

    def reservation_comb(self, m, cancel_store, set_rsrv, clear_rsrv,
                         r0_valid, r0, reservation):
        """Handle load-with-reservation and store-conditional instructions
        """
        comb = m.d.comb
        sync = m.d.sync

        with m.If(r0_valid & r0.req.reserve):
            # XXX generate alignment interrupt if address
            # is not aligned, XXX or if r0.req.nc = 1
            with m.If(r0.req.load):
                comb += set_rsrv.eq(1) # load with reservation
            with m.Else():
                comb += clear_rsrv.eq(1) # store conditional
                with m.If(~reservation.valid |
                          (r0.req.addr[LINE_OFF_BITS:64] != reservation.addr)):
                    comb += cancel_store.eq(1)
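
        # Illustrative sequence (PowerISA larx/stcx.): a load-with-reservation
        # to address A sets reservation.valid and records A's line address; a
        # later store-conditional to a different line (or with no valid
        # reservation) raises cancel_store, which becomes OP_STCX_FAIL and
        # reports store_done=0 back to loadstore1.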

    def reservation_reg(self, m, r0_valid, access_ok, set_rsrv, clear_rsrv,
                        reservation, r0):

        comb = m.d.comb
        sync = m.d.sync

        with m.If(r0_valid & access_ok):
            with m.If(clear_rsrv):
                sync += reservation.valid.eq(0)
            with m.Elif(set_rsrv):
                sync += reservation.valid.eq(1)
                sync += reservation.addr.eq(r0.req.addr[LINE_OFF_BITS:64])

    def writeback_control(self, m, r1, cache_out_row):
        """Return data for loads & completion control logic
        """
        comb = m.d.comb
        sync = m.d.sync
        d_out, m_out = self.d_out, self.m_out

        data_out = Signal(64)
        data_fwd = Signal(64)

        # Use the bypass if we are reading the row that was
        # written 1 or 2 cycles ago, including for the
        # slow_valid = 1 case (i.e. completing a load
        # miss or a non-cacheable load).
        with m.If(r1.use_forward1):
            comb += data_fwd.eq(r1.forward_data1)
        with m.Else():
            comb += data_fwd.eq(r1.forward_data2)

        comb += data_out.eq(cache_out_row)

        for i in range(8):
            with m.If(r1.forward_sel[i]):
                dsel = data_fwd.word_select(i, 8)
                comb += data_out.word_select(i, 8).eq(dsel)

        comb += d_out.valid.eq(r1.ls_valid)
        comb += d_out.data.eq(data_out)
        comb += d_out.store_done.eq(~r1.stcx_fail)
        comb += d_out.error.eq(r1.ls_error)
        comb += d_out.cache_paradox.eq(r1.cache_paradox)

        # Outputs to MMU
        comb += m_out.done.eq(r1.mmu_done)
        comb += m_out.err.eq(r1.mmu_error)
        comb += m_out.data.eq(data_out)

        # We have a valid load or store hit or we just completed
        # a slow op such as a load miss, a NC load or a store
        #
        # Note: the load hit is delayed by one cycle. However it
        # still cannot collide with r1.slow_valid (well, unless I
        # miscalculated) because slow_valid can only be set on a
        # subsequent request and not on its first cycle (the state
        # machine must have advanced), which makes slow_valid
        # at least 2 cycles from the previous hit_load_valid.

        # Sanity: Only one of these must be set in any given cycle

        if False: # TODO: need Display to get this to work
            assert (r1.slow_valid & r1.stcx_fail) != 1, \
                "unexpected slow_valid collision with stcx_fail"

            assert ((r1.slow_valid | r1.stcx_fail) | r1.hit_load_valid) != 1, \
                "unexpected hit_load_delayed collision with slow_valid"

        with m.If(~r1.mmu_req):
            # Request came from loadstore1...
            # Load hit case is the standard path
            with m.If(r1.hit_load_valid):
                sync += Display("completing load hit data=%x", data_out)

            # error cases complete without stalling
            with m.If(r1.ls_error):
                sync += Display("completing ld/st with error")

            # Slow ops (load miss, NC, stores)
            with m.If(r1.slow_valid):
                sync += Display("completing store or load miss data=%x",
                                data_out)

        with m.Else():
            # Request came from MMU
            with m.If(r1.hit_load_valid):
                sync += Display("completing load hit to MMU, data=%x",
                                m_out.data)
            # error cases complete without stalling
            with m.If(r1.mmu_error):
                sync += Display("completing MMU ld with error")

            # Slow ops (i.e. load miss)
            with m.If(r1.slow_valid):
                sync += Display("completing MMU load miss, data=%x",
                                m_out.data)

    def rams(self, m, r1, early_req_row, cache_out_row, replace_way):
        """rams
        Generate a cache RAM for each way. This handles the normal
        reads, writes from reloads and the special store-hit update
        path as well.

        Note: the BRAMs have an extra read buffer, meaning the output
        is pipelined an extra cycle. This differs from the
        icache. The writeback logic needs to take that into
        account by using 1-cycle delayed signals for load hits.
        """
        comb = m.d.comb
        wb_in = self.wb_in

        for i in range(NUM_WAYS):
            do_read = Signal(name="do_rd%d" % i)
            rd_addr = Signal(ROW_BITS)
            do_write = Signal(name="do_wr%d" % i)
            wr_addr = Signal(ROW_BITS)
            wr_data = Signal(WB_DATA_BITS)
            wr_sel = Signal(ROW_SIZE)
            wr_sel_m = Signal(ROW_SIZE)
            _d_out = Signal(WB_DATA_BITS, name="dout_%d" % i)

            way = CacheRam(ROW_BITS, WB_DATA_BITS, ADD_BUF=True)
            setattr(m.submodules, "cacheram_%d" % i, way)

            comb += way.rd_en.eq(do_read)
            comb += way.rd_addr.eq(rd_addr)
            comb += _d_out.eq(way.rd_data_o)
            comb += way.wr_sel.eq(wr_sel_m)
            comb += way.wr_addr.eq(wr_addr)
            comb += way.wr_data.eq(wr_data)

            # Cache hit reads
            comb += do_read.eq(1)
            comb += rd_addr.eq(early_req_row[:ROW_BITS])
            with m.If(r1.hit_way == i):
                comb += cache_out_row.eq(_d_out)

            # Write mux:
            #
            # Defaults to wishbone read responses (cache refill)
            #
            # For timing, the mux on wr_data/sel/addr is not
            # dependent on anything other than the current state.

            with m.If(r1.write_bram):
                # Write store data to BRAM. This happens one
                # cycle after the store is in r0.
                comb += wr_data.eq(r1.req.data)
                comb += wr_sel.eq(r1.req.byte_sel)
                comb += wr_addr.eq(get_row(r1.req.real_addr))

                with m.If(i == r1.req.hit_way):
                    comb += do_write.eq(1)
            with m.Else():
                # Otherwise, we might be doing a reload or a DCBZ
                with m.If(r1.dcbz):
                    comb += wr_data.eq(0)
                with m.Else():
                    comb += wr_data.eq(wb_in.dat)
                comb += wr_addr.eq(r1.store_row)
                comb += wr_sel.eq(~0) # all 1s

                with m.If((r1.state == State.RELOAD_WAIT_ACK)
                          & wb_in.ack & (replace_way == i)):
                    comb += do_write.eq(1)

            # Mask write selects with do_write since BRAM
            # doesn't have a global write-enable
            with m.If(do_write):
                comb += wr_sel_m.eq(wr_sel)

    # Cache hit synchronous machine for the easy case.
    # This handles load hits.
    # It also handles error cases (TLB miss, cache paradox)
    def dcache_fast_hit(self, m, req_op, r0_valid, r0, r1,
                        req_hit_way, req_index, req_tag, access_ok,
                        tlb_hit, tlb_hit_way, tlb_req_index):

        comb = m.d.comb
        sync = m.d.sync

        with m.If(req_op != Op.OP_NONE):
            sync += Display("op:%d addr:%x nc: %d idx: %x tag: %x way: %x",
                            req_op, r0.req.addr, r0.req.nc,
                            req_index, req_tag, req_hit_way)

        with m.If(r0_valid):
            sync += r1.mmu_req.eq(r0.mmu_req)

        # Fast path for load/store hits.
        # Set signals for the writeback controls.
        sync += r1.hit_way.eq(req_hit_way)
        sync += r1.hit_index.eq(req_index)

        with m.If(req_op == Op.OP_LOAD_HIT):
            sync += r1.hit_load_valid.eq(1)
        with m.Else():
            sync += r1.hit_load_valid.eq(0)

        with m.If((req_op == Op.OP_LOAD_HIT) | (req_op == Op.OP_STORE_HIT)):
            sync += r1.cache_hit.eq(1)
        with m.Else():
            sync += r1.cache_hit.eq(0)

        with m.If(req_op == Op.OP_BAD):
            # Display(f"Signalling ld/st error valid_ra={valid_ra}"
            #         f"rc_ok={rc_ok} perm_ok={perm_ok}")
            sync += r1.ls_error.eq(~r0.mmu_req)
            sync += r1.mmu_error.eq(r0.mmu_req)
            sync += r1.cache_paradox.eq(access_ok)

        with m.Else():
            sync += r1.ls_error.eq(0)
            sync += r1.mmu_error.eq(0)
            sync += r1.cache_paradox.eq(0)

        with m.If(req_op == Op.OP_STCX_FAIL):
            sync += r1.stcx_fail.eq(1)
        with m.Else():
            sync += r1.stcx_fail.eq(0)

        # Record TLB hit information for updating TLB PLRU
        sync += r1.tlb_hit.eq(tlb_hit)
        sync += r1.tlb_hit_way.eq(tlb_hit_way)
        sync += r1.tlb_hit_index.eq(tlb_req_index)

    # Memory accesses are handled by this state machine:
    #
    # * Cache load miss/reload (in conjunction with "rams")
    # * Load hits for non-cacheable forms
    # * Stores (the collision case is handled in "rams")
    #
    # All wishbone request generation is done here.
    # This machine operates at stage 1.
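    #
    # Illustrative state transitions (drawn from the Cases below):
    #   IDLE -> RELOAD_WAIT_ACK    on OP_LOAD_MISS, or on a dcbz store
    #   IDLE -> NC_LOAD_WAIT_ACK   on OP_LOAD_NC
    #   IDLE -> STORE_WAIT_ACK     on a non-dcbz OP_STORE_HIT/OP_STORE_MISS
    #   RELOAD_WAIT_ACK -> IDLE    when the last row of the line is acked
    #   STORE_WAIT_ACK -> IDLE     when all outstanding acks have arrived
    #   NC_LOAD_WAIT_ACK -> IDLE   on the single wishbone ack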
    def dcache_slow(self, m, r1, use_forward1_next, use_forward2_next,
                    cache_valids, r0, replace_way,
                    req_hit_way, req_same_tag,
                    r0_valid, req_op, cache_tags, req_go, ra):

        comb = m.d.comb
        sync = m.d.sync
        wb_in = self.wb_in

        req = MemAccessRequest("mreq_ds")
        acks = Signal(3)
        adjust_acks = Signal(3)

        req_row = Signal(ROW_BITS)
        req_idx = Signal(INDEX_BITS)
        req_tag = Signal(TAG_BITS)
        comb += req_idx.eq(get_index(req.real_addr))
        comb += req_row.eq(get_row(req.real_addr))
        comb += req_tag.eq(get_tag(req.real_addr))

        sync += r1.use_forward1.eq(use_forward1_next)
        sync += r1.forward_sel.eq(0)

        with m.If(use_forward1_next):
            sync += r1.forward_sel.eq(r1.req.byte_sel)
        with m.Elif(use_forward2_next):
            sync += r1.forward_sel.eq(r1.forward_sel1)

        sync += r1.forward_data2.eq(r1.forward_data1)
        with m.If(r1.write_bram):
            sync += r1.forward_data1.eq(r1.req.data)
            sync += r1.forward_sel1.eq(r1.req.byte_sel)
            sync += r1.forward_way1.eq(r1.req.hit_way)
            sync += r1.forward_row1.eq(get_row(r1.req.real_addr))
            sync += r1.forward_valid1.eq(1)
        with m.Else():
            with m.If(r1.dcbz):
                sync += r1.forward_data1.eq(0)
            with m.Else():
                sync += r1.forward_data1.eq(wb_in.dat)
            sync += r1.forward_sel1.eq(~0) # all 1s
            sync += r1.forward_way1.eq(replace_way)
            sync += r1.forward_row1.eq(r1.store_row)
            sync += r1.forward_valid1.eq(0)

        # One cycle pulses reset
        sync += r1.slow_valid.eq(0)
        sync += r1.write_bram.eq(0)
        sync += r1.inc_acks.eq(0)
        sync += r1.dec_acks.eq(0)

        sync += r1.ls_valid.eq(0)
        # complete tlbies and TLB loads in the third cycle
        sync += r1.mmu_done.eq(r0_valid & (r0.tlbie | r0.tlbld))

        with m.If((req_op == Op.OP_LOAD_HIT)
                  | (req_op == Op.OP_STCX_FAIL)):
            with m.If(~r0.mmu_req):
                sync += r1.ls_valid.eq(1)
            with m.Else():
                sync += r1.mmu_done.eq(1)

        with m.If(r1.write_tag):
            # Store new tag in selected way
            for i in range(NUM_WAYS):
                with m.If(i == replace_way):
                    ct = Signal(TAG_RAM_WIDTH)
                    comb += ct.eq(cache_tags[r1.store_index])
                    comb += ct.word_select(i, TAG_WIDTH).eq(r1.reload_tag)
                    sync += cache_tags[r1.store_index].eq(ct)
            sync += r1.store_way.eq(replace_way)
            sync += r1.write_tag.eq(0)

        # Take request from r1.req if there is one there,
        # else from req_op, ra, etc.
        with m.If(r1.full):
            comb += req.eq(r1.req)
        with m.Else():
            comb += req.op.eq(req_op)
            comb += req.valid.eq(req_go)
            comb += req.mmu_req.eq(r0.mmu_req)
            comb += req.dcbz.eq(r0.req.dcbz)
            comb += req.real_addr.eq(ra)

            with m.If(~r0.req.dcbz):
                comb += req.data.eq(r0.req.data)
            with m.Else():
                comb += req.data.eq(0)

            # Select all bytes for dcbz
            # and for cacheable loads
            with m.If(r0.req.dcbz | (r0.req.load & ~r0.req.nc)):
                comb += req.byte_sel.eq(~0) # all 1s
            with m.Else():
                comb += req.byte_sel.eq(r0.req.byte_sel)
            comb += req.hit_way.eq(req_hit_way)
            comb += req.same_tag.eq(req_same_tag)

        # Store the incoming request from r0,
        # if it is a slow request
        # Note that r1.full = 1 implies req_op = OP_NONE
        with m.If((req_op == Op.OP_LOAD_MISS)
                  | (req_op == Op.OP_LOAD_NC)
                  | (req_op == Op.OP_STORE_MISS)
                  | (req_op == Op.OP_STORE_HIT)):
            sync += r1.req.eq(req)
            sync += r1.full.eq(1)

        # Main state machine
        with m.Switch(r1.state):

            with m.Case(State.IDLE):
                sync += r1.real_adr.eq(req.real_addr)
                sync += r1.wb.sel.eq(req.byte_sel)
                sync += r1.wb.dat.eq(req.data)
                sync += r1.dcbz.eq(req.dcbz)

                # Keep track of our index and way
                # for subsequent stores.
                sync += r1.store_index.eq(req_idx)
                sync += r1.store_row.eq(req_row)
                sync += r1.end_row_ix.eq(get_row_of_line(req_row))
                sync += r1.reload_tag.eq(req_tag)
                sync += r1.req.same_tag.eq(1)

                with m.If(req.op == Op.OP_STORE_HIT):
                    sync += r1.store_way.eq(req.hit_way)

                # Reset per-row valid bits,
                # ready for handling OP_LOAD_MISS
                for i in range(ROW_PER_LINE):
                    sync += r1.rows_valid[i].eq(0)

                with m.If(req_op != Op.OP_NONE):
                    sync += Display("cache op %d", req.op)

                with m.Switch(req.op):
                    with m.Case(Op.OP_LOAD_HIT):
                        # stay in IDLE state
                        pass

                    with m.Case(Op.OP_LOAD_MISS):
                        sync += Display("cache miss real addr: %x " \
                                        "idx: %x tag: %x",
                                        req.real_addr, req_row, req_tag)

                        # Start the wishbone cycle
                        sync += r1.wb.we.eq(0)
                        sync += r1.wb.cyc.eq(1)
                        sync += r1.wb.stb.eq(1)

                        # Track that we had one request sent
                        sync += r1.state.eq(State.RELOAD_WAIT_ACK)
                        sync += r1.write_tag.eq(1)

                    with m.Case(Op.OP_LOAD_NC):
                        sync += r1.wb.cyc.eq(1)
                        sync += r1.wb.stb.eq(1)
                        sync += r1.wb.we.eq(0)
                        sync += r1.state.eq(State.NC_LOAD_WAIT_ACK)

                    with m.Case(Op.OP_STORE_HIT, Op.OP_STORE_MISS):
                        with m.If(~req.dcbz):
                            sync += r1.state.eq(State.STORE_WAIT_ACK)
                            sync += r1.acks_pending.eq(1)
                            sync += r1.full.eq(0)
                            sync += r1.slow_valid.eq(1)

                            with m.If(~req.mmu_req):
                                sync += r1.ls_valid.eq(1)
                            with m.Else():
                                sync += r1.mmu_done.eq(1)

                            with m.If(req.op == Op.OP_STORE_HIT):
                                sync += r1.write_bram.eq(1)
                        with m.Else():
                            # dcbz is handled much like a load miss except
                            # that we are writing to memory instead of reading
                            sync += r1.state.eq(State.RELOAD_WAIT_ACK)

                            with m.If(req.op == Op.OP_STORE_MISS):
                                sync += r1.write_tag.eq(1)

                        sync += r1.wb.we.eq(1)
                        sync += r1.wb.cyc.eq(1)
                        sync += r1.wb.stb.eq(1)

                    # OP_NONE and OP_BAD do nothing
                    # OP_BAD & OP_STCX_FAIL were
                    # handled above already
                    with m.Case(Op.OP_NONE):
                        pass
                    with m.Case(Op.OP_BAD):
                        pass
                    with m.Case(Op.OP_STCX_FAIL):
                        pass

            with m.Case(State.RELOAD_WAIT_ACK):
                ld_stbs_done = Signal()
                # Requests are all sent if stb is 0
                comb += ld_stbs_done.eq(~r1.wb.stb)

                with m.If((~wb_in.stall) & r1.wb.stb):
                    # That was the last word?
                    # We are done sending.
                    # Clear stb and set ld_stbs_done
                    # so we can handle an eventual
                    # last ack on the same cycle.
                    with m.If(is_last_row_addr(r1.real_adr, r1.end_row_ix)):
                        sync += r1.wb.stb.eq(0)
                        comb += ld_stbs_done.eq(1)

                    # Calculate the next row address in the current cache line
                    row = Signal(LINE_OFF_BITS-ROW_OFF_BITS)
                    comb += row.eq(r1.real_adr[ROW_OFF_BITS:])
                    sync += r1.real_adr[ROW_OFF_BITS:LINE_OFF_BITS].eq(row+1)

                # Incoming acks processing
                sync += r1.forward_valid1.eq(wb_in.ack)
                with m.If(wb_in.ack):
                    srow = Signal(ROW_LINE_BITS)
                    comb += srow.eq(r1.store_row)
                    sync += r1.rows_valid[srow].eq(1)

                    # If this is the data we were looking for,
                    # we can complete the request next cycle.
                    # Compare the whole address in case the
                    # request in r1.req is not the one that
                    # started this refill.
                    with m.If(r1.full & r1.req.same_tag &
                              ((r1.dcbz & r1.req.dcbz) |
                               (~r1.dcbz & (r1.req.op == Op.OP_LOAD_MISS))) &
                              (r1.store_row == get_row(r1.req.real_addr))):
                        sync += r1.full.eq(0)
                        sync += r1.slow_valid.eq(1)
                        with m.If(~r1.mmu_req):
                            sync += r1.ls_valid.eq(1)
                        with m.Else():
                            sync += r1.mmu_done.eq(1)
                        sync += r1.forward_sel.eq(~0) # all 1s
                        sync += r1.use_forward1.eq(1)

                    # Check for completion
                    with m.If(ld_stbs_done & is_last_row(r1.store_row,
                                                         r1.end_row_ix)):
                        # Complete wishbone cycle
                        sync += r1.wb.cyc.eq(0)

                        # Cache line is now valid
                        cv = Signal(NUM_WAYS)
                        comb += cv.eq(cache_valids[r1.store_index])
                        comb += cv.bit_select(r1.store_way, 1).eq(1)
                        sync += cache_valids[r1.store_index].eq(cv)
                        sync += r1.state.eq(State.IDLE)

                    # Increment store row counter
                    sync += r1.store_row.eq(next_row(r1.store_row))

            with m.Case(State.STORE_WAIT_ACK):
                st_stbs_done = Signal()
                comb += st_stbs_done.eq(~r1.wb.stb)
                comb += acks.eq(r1.acks_pending)

                with m.If(r1.inc_acks != r1.dec_acks):
                    with m.If(r1.inc_acks):
                        comb += adjust_acks.eq(acks + 1)
                    with m.Else():
                        comb += adjust_acks.eq(acks - 1)
                with m.Else():
                    comb += adjust_acks.eq(acks)

                sync += r1.acks_pending.eq(adjust_acks)

                # Clear stb when slave accepted request
                with m.If(~wb_in.stall):
                    # See if there is another store waiting
                    # to be done which is in the same real page.
                    with m.If(req.valid):
                        adr = req.real_addr[0:SET_SIZE_BITS]
                        sync += r1.real_adr[0:SET_SIZE_BITS].eq(adr)
                        sync += r1.wb.dat.eq(req.data)
                        sync += r1.wb.sel.eq(req.byte_sel)

                    with m.Elif((adjust_acks < 7) & req.same_tag &
                                ((req.op == Op.OP_STORE_MISS)
                                 | (req.op == Op.OP_STORE_HIT))):
                        sync += r1.wb.stb.eq(1)
                        comb += st_stbs_done.eq(0)

                        with m.If(req.op == Op.OP_STORE_HIT):
                            sync += r1.write_bram.eq(1)
                        sync += r1.full.eq(0)
                        sync += r1.slow_valid.eq(1)

                        # Store requests never come from the MMU
                        sync += r1.ls_valid.eq(1)
                        comb += st_stbs_done.eq(0)
                        sync += r1.inc_acks.eq(1)
                    with m.Else():
                        sync += r1.wb.stb.eq(0)
                        comb += st_stbs_done.eq(1)

                # Got ack ? See if complete.
                with m.If(wb_in.ack):
                    with m.If(st_stbs_done & (adjust_acks == 1)):
                        sync += r1.state.eq(State.IDLE)
                        sync += r1.wb.cyc.eq(0)
                        sync += r1.wb.stb.eq(0)
                    sync += r1.dec_acks.eq(1)

            with m.Case(State.NC_LOAD_WAIT_ACK):
                # Clear stb when slave accepted request
                with m.If(~wb_in.stall):
                    sync += r1.wb.stb.eq(0)

                # Got ack ? complete.
                with m.If(wb_in.ack):
                    sync += r1.state.eq(State.IDLE)
                    sync += r1.full.eq(0)
                    sync += r1.slow_valid.eq(1)

                    with m.If(~r1.mmu_req):
                        sync += r1.ls_valid.eq(1)
                    with m.Else():
                        sync += r1.mmu_done.eq(1)

                    sync += r1.forward_sel.eq(~0) # all 1s
                    sync += r1.use_forward1.eq(1)
                    sync += r1.wb.cyc.eq(0)
                    sync += r1.wb.stb.eq(0)

    def dcache_log(self, m, r1, valid_ra, tlb_hit_way, stall_out, req_op):

        sync = m.d.sync
        d_out, wb_in, log_out = self.d_out, self.wb_in, self.log_out

        sync += log_out.eq(Cat(r1.state[:3], valid_ra, tlb_hit_way[:3],
                               stall_out, req_op[:3], d_out.valid, d_out.error,
                               r1.wb.cyc, r1.wb.stb, wb_in.ack, wb_in.stall,
                               r1.real_adr[3:6]))

    def elaborate(self, platform):

        m = Module()
        comb = m.d.comb

        # Storage. Hopefully "cache_rows" is a BRAM, the rest is LUTs
        cache_tags = CacheTagArray()
        cache_tag_set = Signal(TAG_RAM_WIDTH)
        cache_valids = CacheValidBitsArray()

        # TODO attribute ram_style : string;
        # TODO attribute ram_style of cache_tags : signal is "distributed";

        """note: these are passed to nmigen.hdl.Memory as "attributes".
           don't know how, just that they are.
        """
        dtlb_valid_bits = TLBValidBitsArray()
        dtlb_tags = TLBTagsArray()
        dtlb_ptes = TLBPtesArray()
        # TODO attribute ram_style of
        #  dtlb_tags : signal is "distributed";
        # TODO attribute ram_style of
        #  dtlb_ptes : signal is "distributed";

        r0 = RegStage0("r0")
        r0_full = Signal()

        r1 = RegStage1("r1")

        reservation = Reservation()

        # Async signals on incoming request
        req_index = Signal(INDEX_BITS)
        req_row = Signal(ROW_BITS)
        req_hit_way = Signal(WAY_BITS)
        req_tag = Signal(TAG_BITS)
        req_op = Signal(Op)
        req_data = Signal(64)
        req_same_tag = Signal()
        req_go = Signal()

        early_req_row = Signal(ROW_BITS)

        cancel_store = Signal()
        set_rsrv = Signal()
        clear_rsrv = Signal()

        r0_valid = Signal()
        r0_stall = Signal()

        use_forward1_next = Signal()
        use_forward2_next = Signal()

        cache_out_row = Signal(WB_DATA_BITS)

        plru_victim = PLRUOut()
        replace_way = Signal(WAY_BITS)

        # Wishbone read/write/cache write formatting signals
        bus_sel = Signal(8)

        # TLB signals
        tlb_tag_way = Signal(TLB_TAG_WAY_BITS)
        tlb_pte_way = Signal(TLB_PTE_WAY_BITS)
        tlb_valid_way = Signal(TLB_NUM_WAYS)
        tlb_req_index = Signal(TLB_SET_BITS)
        tlb_hit = Signal()
        tlb_hit_way = Signal(TLB_WAY_BITS)
        pte = Signal(TLB_PTE_BITS)
        ra = Signal(REAL_ADDR_BITS)
        valid_ra = Signal()
        perm_attr = PermAttr("dc_perms")
        rc_ok = Signal()
        perm_ok = Signal()
        access_ok = Signal()

        tlb_plru_victim = TLBPLRUOut()

        # we don't yet handle collisions between loadstore1 requests
        # and MMU requests
        comb += self.m_out.stall.eq(0)

        # Hold off the request in r0 when r1 has an uncompleted request
        comb += r0_stall.eq(r0_full & r1.full)
        comb += r0_valid.eq(r0_full & ~r1.full)
        comb += self.stall_out.eq(r0_stall)

        # Wire up wishbone request latch out of stage 1
        comb += r1.wb.adr.eq(r1.real_adr[ROW_OFF_BITS:]) # truncate LSBs

        comb += self.wb_out.eq(r1.wb)

        # call sub-functions putting everything together, using shared
        # signals established above
        self.stage_0(m, r0, r1, r0_full)
        self.tlb_read(m, r0_stall, tlb_valid_way,
                      tlb_tag_way, tlb_pte_way, dtlb_valid_bits,
                      dtlb_tags, dtlb_ptes)
        self.tlb_search(m, tlb_req_index, r0, r0_valid,
                        tlb_valid_way, tlb_tag_way, tlb_hit_way,
                        tlb_pte_way, pte, tlb_hit, valid_ra, perm_attr, ra)
        self.tlb_update(m, r0_valid, r0, dtlb_valid_bits, tlb_req_index,
                        tlb_hit_way, tlb_hit, tlb_plru_victim, tlb_tag_way,
                        dtlb_tags, tlb_pte_way, dtlb_ptes)
        self.maybe_plrus(m, r1, plru_victim)
        self.maybe_tlb_plrus(m, r1, tlb_plru_victim)
        self.cache_tag_read(m, r0_stall, req_index, cache_tag_set, cache_tags)
        self.dcache_request(m, r0, ra, req_index, req_row, req_tag,
                            r0_valid, r1, cache_valids, replace_way,
                            use_forward1_next, use_forward2_next,
                            req_hit_way, plru_victim, rc_ok, perm_attr,
                            valid_ra, perm_ok, access_ok, req_op, req_go,
                            tlb_pte_way,
                            tlb_hit, tlb_hit_way, tlb_valid_way, cache_tag_set,
                            cancel_store, req_same_tag, r0_stall,
                            early_req_row)
        self.reservation_comb(m, cancel_store, set_rsrv, clear_rsrv,
                              r0_valid, r0, reservation)
        self.reservation_reg(m, r0_valid, access_ok, set_rsrv, clear_rsrv,
                             reservation, r0)
        self.writeback_control(m, r1, cache_out_row)
        self.rams(m, r1, early_req_row, cache_out_row, replace_way)
        self.dcache_fast_hit(m, req_op, r0_valid, r0, r1,
                             req_hit_way, req_index, req_tag, access_ok,
                             tlb_hit, tlb_hit_way, tlb_req_index)
        self.dcache_slow(m, r1, use_forward1_next, use_forward2_next,
                         cache_valids, r0, replace_way,
                         req_hit_way, req_same_tag,
                         r0_valid, req_op, cache_tags, req_go, ra)
        #self.dcache_log(m, r1, valid_ra, tlb_hit_way, self.stall_out, req_op)

        return m


def dcache_load(dut, addr, nc=0):
    yield dut.d_in.load.eq(1)
    yield dut.d_in.nc.eq(nc)
    yield dut.d_in.addr.eq(addr)
    yield dut.d_in.byte_sel.eq(~0)
    yield dut.d_in.valid.eq(1)
    yield
    yield dut.d_in.valid.eq(0)
    yield dut.d_in.byte_sel.eq(0)
    yield
    while not (yield dut.d_out.valid):
        yield
    data = yield dut.d_out.data
    return data


def dcache_store(dut, addr, data, nc=0):
    yield dut.d_in.load.eq(0)
    yield dut.d_in.nc.eq(nc)
    yield dut.d_in.data.eq(data)
    yield dut.d_in.byte_sel.eq(~0)
    yield dut.d_in.addr.eq(addr)
    yield dut.d_in.valid.eq(1)
    yield
    yield dut.d_in.valid.eq(0)
    yield dut.d_in.byte_sel.eq(0)
    yield
    while not (yield dut.d_out.valid):
        yield


def dcache_random_sim(dut):

    # start with a 512-entry memory model of zeros
    sim_mem = [0] * 512

    # clear stuff
    yield dut.d_in.valid.eq(0)
    yield dut.d_in.load.eq(0)
    yield dut.d_in.priv_mode.eq(1)
    yield dut.d_in.nc.eq(0)
    yield dut.d_in.addr.eq(0)
    yield dut.d_in.data.eq(0)
    yield dut.m_in.valid.eq(0)
    yield dut.m_in.addr.eq(0)
    yield dut.m_in.pte.eq(0)
    # wait 4 * clk_period
    yield
    yield
    yield
    yield

    print ()

    for i in range(256):
        addr = randint(0, 255)
        data = randint(0, (1<<64)-1)
        sim_mem[addr] = data
        addr *= 8

        print ("testing %x data %x" % (addr, data))

        yield from dcache_load(dut, addr)
        yield from dcache_store(dut, addr, data)

        addr = randint(0, 255)
        sim_data = sim_mem[addr]
        addr *= 8

        data = yield from dcache_load(dut, addr)
        assert data == sim_data, \
            "check %x data %x != %x" % (addr, data, sim_data)

    for addr in range(256):
        data = yield from dcache_load(dut, addr*8)
        assert data == sim_mem[addr], \
            "final check %x data %x != %x" % (addr*8, data, sim_mem[addr])

def dcache_sim(dut):
    # clear stuff
    yield dut.d_in.valid.eq(0)
    yield dut.d_in.load.eq(0)
    yield dut.d_in.priv_mode.eq(1)
    yield dut.d_in.nc.eq(0)
    yield dut.d_in.addr.eq(0)
    yield dut.d_in.data.eq(0)
    yield dut.m_in.valid.eq(0)
    yield dut.m_in.addr.eq(0)
    yield dut.m_in.pte.eq(0)
    # wait 4 * clk_period
    yield
    yield
    yield
    yield

    # Cacheable read of address 0x58
    data = yield from dcache_load(dut, 0x58)
    addr = yield dut.d_in.addr
    assert data == 0x0000001700000016, \
        "data @%x=%x expected 0x0000001700000016" % (addr, data)

    # Cacheable read of address 0x20
    data = yield from dcache_load(dut, 0x20)
    addr = yield dut.d_in.addr
    assert data == 0x0000000900000008, \
        "data @%x=%x expected 0x0000000900000008" % (addr, data)

    # Cacheable read of address 0x530
    data = yield from dcache_load(dut, 0x530)
    addr = yield dut.d_in.addr
    assert data == 0x0000014D0000014C, \
        "data @%x=%x expected 0x0000014D0000014C" % (addr, data)

    # 2nd cacheable read of address 0x530
    data = yield from dcache_load(dut, 0x530)
    addr = yield dut.d_in.addr
    assert data == 0x0000014D0000014C, \
        "data @%x=%x expected 0x0000014D0000014C" % (addr, data)

    # Non-cacheable read of address 0x100
    data = yield from dcache_load(dut, 0x100, nc=1)
    addr = yield dut.d_in.addr
    assert data == 0x0000004100000040, \
        "data @%x=%x expected 0x0000004100000040" % (addr, data)

    # Store at address 0x530
    yield from dcache_store(dut, 0x530, 0x121)

    # 2nd store at address 0x530
    yield from dcache_store(dut, 0x530, 0x12345678)

    # 3rd cacheable read of address 0x530: must return the stored data
    data = yield from dcache_load(dut, 0x530)
    addr = yield dut.d_in.addr
    assert data == 0x12345678, \
        "data @%x=%x expected 0x12345678" % (addr, data)

    # 2nd cacheable read of address 0x20
    data = yield from dcache_load(dut, 0x20)
    addr = yield dut.d_in.addr
    assert data == 0x0000000900000008, \
        "data @%x=%x expected 0x0000000900000008" % (addr, data)

    yield
    yield
    yield
    yield


def test_dcache(mem, test_fn, test_name):
    dut = DCache()

    memory = Memory(width=64, depth=16*64, init=mem)
    sram = SRAM(memory=memory, granularity=8)

    m = Module()
    m.submodules.dcache = dut
    m.submodules.sram = sram

    m.d.comb += sram.bus.cyc.eq(dut.wb_out.cyc)
    m.d.comb += sram.bus.stb.eq(dut.wb_out.stb)
    m.d.comb += sram.bus.we.eq(dut.wb_out.we)
    m.d.comb += sram.bus.sel.eq(dut.wb_out.sel)
    m.d.comb += sram.bus.adr.eq(dut.wb_out.adr)
    m.d.comb += sram.bus.dat_w.eq(dut.wb_out.dat)

    m.d.comb += dut.wb_in.ack.eq(sram.bus.ack)
    m.d.comb += dut.wb_in.dat.eq(sram.bus.dat_r)

    # nmigen Simulation
    sim = Simulator(m)
    sim.add_clock(1e-6)

    sim.add_sync_process(wrap(test_fn(dut)))
    with sim.write_vcd('test_dcache%s.vcd' % test_name):
        sim.run()

if __name__ == '__main__':
    dut = DCache()
    vl = rtlil.convert(dut, ports=[])
    with open("test_dcache.il", "w") as f:
        f.write(vl)

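    # Fill memory with a recognisable pattern: 64-bit word i holds 2*i in
    # its low half and 2*i+1 in its high half, which is where dcache_sim's
    # expected values come from (e.g. addr 0x58 is word 11, so it reads
    # back 0x0000001700000016).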
    mem = []
    for i in range(0,512):
        mem.append((i*2)| ((i*2+1)<<32))

    test_dcache(mem, dcache_sim, "")
    test_dcache(None, dcache_random_sim, "random")