1 """DCache
2
3 based on Anton Blanchard microwatt dcache.vhdl
4
5 """
6
7 from enum import Enum, unique
8
9 from nmigen import Module, Signal, Elaboratable, Cat, Repl, Array, Const
10 from nmutil.util import Display
11
12 from random import randint
13
14 from nmigen.cli import main
15 from nmutil.iocontrol import RecordObject
16 from nmigen.utils import log2_int
17 from soc.experiment.mem_types import (LoadStore1ToDCacheType,
18 DCacheToLoadStore1Type,
19 MMUToDCacheType,
20 DCacheToMMUType)
21
22 from soc.experiment.wb_types import (WB_ADDR_BITS, WB_DATA_BITS, WB_SEL_BITS,
23 WBAddrType, WBDataType, WBSelType,
24 WBMasterOut, WBSlaveOut,
25 WBMasterOutVector, WBSlaveOutVector,
26 WBIOMasterOut, WBIOSlaveOut)
27
28 from soc.experiment.cache_ram import CacheRam
29 from soc.experiment.plru import PLRU
30
31 # for test
32 from nmigen_soc.wishbone.sram import SRAM
33 from nmigen import Memory
34 from nmigen.cli import rtlil
35 if True:
36 from nmigen.back.pysim import Simulator, Delay, Settle
37 else:
38 from nmigen.sim.cxxsim import Simulator, Delay, Settle
39 from nmutil.util import wrap


# TODO: make these parameters of DCache at some point
LINE_SIZE = 64    # Line size in bytes
NUM_LINES = 16    # Number of lines in a set
NUM_WAYS = 4      # Number of ways
TLB_SET_SIZE = 64 # L1 DTLB entries per set
TLB_NUM_WAYS = 2  # L1 DTLB number of ways
TLB_LG_PGSZ = 12  # L1 DTLB log_2(page_size)
LOG_LENGTH = 0    # Non-zero to enable log data collection

# BRAM organisation: We never access more than WB_DATA_BITS
# at a time so to save resources we make the array only that
# wide, and use consecutive indices to make a cache "line"
#
# ROW_SIZE is the width in bytes of the BRAM
# (based on WB, so 64-bits)
ROW_SIZE = WB_DATA_BITS // 8

# ROW_PER_LINE is the number of rows (wishbone
# transactions) in a line
ROW_PER_LINE = LINE_SIZE // ROW_SIZE

# BRAM_ROWS is the number of rows in BRAM needed
# to represent the full dcache
BRAM_ROWS = NUM_LINES * ROW_PER_LINE

print ("ROW_SIZE", ROW_SIZE)
print ("ROW_PER_LINE", ROW_PER_LINE)
print ("BRAM_ROWS", BRAM_ROWS)
print ("NUM_WAYS", NUM_WAYS)

# Bit field counts in the address

# REAL_ADDR_BITS is the number of real address
# bits that we store
REAL_ADDR_BITS = 56

# ROW_BITS is the number of bits to select a row
ROW_BITS = log2_int(BRAM_ROWS)

# ROW_LINE_BITS is the number of bits to select
# a row within a line
ROW_LINE_BITS = log2_int(ROW_PER_LINE)

# LINE_OFF_BITS is the number of bits for
# the offset in a cache line
LINE_OFF_BITS = log2_int(LINE_SIZE)

# ROW_OFF_BITS is the number of bits for
# the offset in a row
ROW_OFF_BITS = log2_int(ROW_SIZE)

# INDEX_BITS is the number of bits to
# select a cache line
INDEX_BITS = log2_int(NUM_LINES)

# SET_SIZE_BITS is the log base 2 of the set size
SET_SIZE_BITS = LINE_OFF_BITS + INDEX_BITS

# TAG_BITS is the number of bits of
# the tag part of the address
TAG_BITS = REAL_ADDR_BITS - SET_SIZE_BITS

# TAG_WIDTH is the width in bits of each way of the tag RAM
# (TAG_BITS rounded up to a multiple of 8)
TAG_WIDTH = TAG_BITS + 7 - ((TAG_BITS + 7) % 8)

# WAY_BITS is the number of bits to select a way
WAY_BITS = log2_int(NUM_WAYS)

# Example of layout for 32 lines of 64 bytes:
layout = """\
  ..         tag    |index|  line  |
  ..            |   row   |        |
  ..            |     |---|        | ROW_LINE_BITS  (3)
  ..            |     |--- - --|   | LINE_OFF_BITS  (6)
  ..            |         |- --|   | ROW_OFF_BITS   (3)
  ..            |----- ---|        | ROW_BITS       (8)
  ..            |-----|            | INDEX_BITS     (5)
  ..        --------|              | TAG_BITS       (45)
"""
print (layout)
print ("Dcache TAG %d IDX %d ROW %d ROFF %d LOFF %d RLB %d" % \
            (TAG_BITS, INDEX_BITS, ROW_BITS,
             ROW_OFF_BITS, LINE_OFF_BITS, ROW_LINE_BITS))
print ("index @: %d-%d" % (LINE_OFF_BITS, SET_SIZE_BITS))
print ("row @: %d-%d" % (ROW_OFF_BITS, SET_SIZE_BITS))
print ("tag @: %d-%d width %d" % (SET_SIZE_BITS, REAL_ADDR_BITS, TAG_WIDTH))

TAG_RAM_WIDTH = TAG_WIDTH * NUM_WAYS

def CacheTagArray():
    return Array(Signal(TAG_RAM_WIDTH, name="cachetag_%d" % x) \
                 for x in range(NUM_LINES))

def CacheValidBitsArray():
    return Array(Signal(NUM_WAYS, name="cachevalid_%d" % x) \
                 for x in range(NUM_LINES))

def RowPerLineValidArray():
    return Array(Signal(name="rows_valid%d" % x) \
                 for x in range(ROW_PER_LINE))

# L1 TLB
TLB_SET_BITS = log2_int(TLB_SET_SIZE)
TLB_WAY_BITS = log2_int(TLB_NUM_WAYS)
TLB_EA_TAG_BITS = 64 - (TLB_LG_PGSZ + TLB_SET_BITS)
TLB_TAG_WAY_BITS = TLB_NUM_WAYS * TLB_EA_TAG_BITS
TLB_PTE_BITS = 64
TLB_PTE_WAY_BITS = TLB_NUM_WAYS * TLB_PTE_BITS

def ispow2(x):
    return (1 << log2_int(x, False)) == x

assert (LINE_SIZE % ROW_SIZE) == 0, "LINE_SIZE not multiple of ROW_SIZE"
assert ispow2(LINE_SIZE), "LINE_SIZE not power of 2"
assert ispow2(NUM_LINES), "NUM_LINES not power of 2"
assert ispow2(ROW_PER_LINE), "ROW_PER_LINE not power of 2"
assert ROW_BITS == (INDEX_BITS + ROW_LINE_BITS), "geometry bits don't add up"
assert (LINE_OFF_BITS == ROW_OFF_BITS + ROW_LINE_BITS), \
        "geometry bits don't add up"
assert REAL_ADDR_BITS == (TAG_BITS + INDEX_BITS + LINE_OFF_BITS), \
        "geometry bits don't add up"
assert REAL_ADDR_BITS == (TAG_BITS + ROW_BITS + ROW_OFF_BITS), \
        "geometry bits don't add up"
assert 64 == WB_DATA_BITS, "Can't yet handle wb width that isn't 64-bits"
assert SET_SIZE_BITS <= TLB_LG_PGSZ, "Set indexed by virtual address"


def TLBValidBitsArray():
    return Array(Signal(TLB_NUM_WAYS) for x in range(TLB_SET_SIZE))

def TLBTagEAArray():
    return Array(Signal(TLB_EA_TAG_BITS) for x in range(TLB_NUM_WAYS))

def TLBTagsArray():
    return Array(Signal(TLB_TAG_WAY_BITS) for x in range(TLB_SET_SIZE))

def TLBPtesArray():
    return Array(Signal(TLB_PTE_WAY_BITS) for x in range(TLB_SET_SIZE))

def HitWaySet():
    return Array(Signal(WAY_BITS, name="hitway_%d" % x) \
                 for x in range(TLB_NUM_WAYS))

# Cache RAM interface
def CacheRamOut():
    return Array(Signal(WB_DATA_BITS, name="cache_out%d" % x) \
                 for x in range(NUM_WAYS))

# PLRU output interface
def PLRUOut():
    return Array(Signal(WAY_BITS) for x in range(NUM_LINES))

# TLB PLRU output interface
def TLBPLRUOut():
    return Array(Signal(TLB_WAY_BITS) for x in range(TLB_SET_SIZE))

# Helper functions to decode incoming requests
#
# Return the cache line index (tag index) for an address
def get_index(addr):
    return addr[LINE_OFF_BITS:SET_SIZE_BITS]

# Return the cache row index (data memory) for an address
def get_row(addr):
    return addr[ROW_OFF_BITS:SET_SIZE_BITS]

# Return the index of a row within a line
def get_row_of_line(row):
    return row[:ROW_LINE_BITS]

# Returns whether this is the last row of a line
def is_last_row_addr(addr, last):
    return addr[ROW_OFF_BITS:LINE_OFF_BITS] == last

# Returns whether this is the last row of a line
def is_last_row(row, last):
    return get_row_of_line(row) == last

# Return the next row in the current cache line. We use a
# dedicated function in order to limit the size of the
# generated adder to be only the bits within a cache line
# (3 bits with default settings)
def next_row(row):
    row_v = row[0:ROW_LINE_BITS] + 1
    return Cat(row_v[:ROW_LINE_BITS], row[ROW_LINE_BITS:])
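
# Editor-added, hedged illustration (a plain-integer model, not part of the
# design): next_row only adds within the low ROW_LINE_BITS, so the row wraps
# around within its line and the upper (line-select) bits never change.
def _next_row_model(row):
    wrap_mask = ROW_PER_LINE - 1
    within = (row + 1) & wrap_mask       # small ROW_LINE_BITS-wide adder
    return (row & ~wrap_mask) | within   # upper bits pass through unchanged
# e.g. with ROW_PER_LINE == 8: _next_row_model(0b101_111) == 0b101_000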

# Get the tag value from the address
def get_tag(addr):
    return addr[SET_SIZE_BITS:REAL_ADDR_BITS]

# Read a tag from a tag memory row
def read_tag(way, tagset):
    return tagset.word_select(way, TAG_WIDTH)[:TAG_BITS]

# Read a TLB tag from a TLB tag memory row
def read_tlb_tag(way, tags):
    return tags.word_select(way, TLB_EA_TAG_BITS)

# Write a TLB tag to a TLB tag memory row
def write_tlb_tag(way, tags, tag):
    return read_tlb_tag(way, tags).eq(tag)

# Read a PTE from a TLB PTE memory row
def read_tlb_pte(way, ptes):
    return ptes.word_select(way, TLB_PTE_BITS)

# Write a PTE to a TLB PTE memory row
def write_tlb_pte(way, ptes, newpte):
    return read_tlb_pte(way, ptes).eq(newpte)
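
# Editor's note (illustrative): the read_*/write_* helpers above treat one
# flat Signal as N way-sized fields, with way i occupying bits
# [i*W, (i+1)*W). A hedged integer equivalent of word_select for reference:
def _word_select_model(val, way, width):
    return (val >> (way * width)) & ((1 << width) - 1)
# so read_tlb_pte(way, ptes) picks the same bits as
# _word_select_model(integer_value_of_ptes, way, TLB_PTE_BITS).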


# Record for storing permission, attribute, etc. bits from a PTE
class PermAttr(RecordObject):
    def __init__(self, name=None):
        super().__init__(name=name)
        self.reference = Signal()
        self.changed = Signal()
        self.nocache = Signal()
        self.priv = Signal()
        self.rd_perm = Signal()
        self.wr_perm = Signal()


def extract_perm_attr(pte):
    # stub: the permission/attribute bits are currently extracted
    # from the PTE inline, in DCache.tlb_search
    pa = PermAttr()
    return pa


# Type of operation on a "valid" input
@unique
class Op(Enum):
    OP_NONE = 0
    OP_BAD = 1           # NC cache hit, TLB miss, prot/RC failure
    OP_STCX_FAIL = 2     # conditional store w/o reservation
    OP_LOAD_HIT = 3      # Cache hit on load
    OP_LOAD_MISS = 4     # Load missing cache
    OP_LOAD_NC = 5       # Non-cacheable load
    OP_STORE_HIT = 6     # Store hitting cache
    OP_STORE_MISS = 7    # Store missing cache


# Cache state machine
@unique
class State(Enum):
    IDLE = 0             # Normal load hit processing
    RELOAD_WAIT_ACK = 1  # Cache reload wait ack
    STORE_WAIT_ACK = 2   # Store wait ack
    NC_LOAD_WAIT_ACK = 3 # Non-cacheable load wait ack


# Dcache operations:
#
# In order to make timing, we use the BRAMs with
# an output buffer, which means that the BRAM
# output is delayed by an extra cycle.
#
# Thus, the dcache has a 2-stage internal pipeline
# for cache hits with no stalls.
#
# All other operations are handled via stalling
# in the first stage.
#
# The second stage can thus complete a hit at the same
# time as the first stage emits a stall for a complex op.
#
# Stage 0 register, basically contains just the latched request

class RegStage0(RecordObject):
    def __init__(self, name=None):
        super().__init__(name=name)
        self.req = LoadStore1ToDCacheType(name="lsmem")
        self.tlbie = Signal()
        self.doall = Signal()
        self.tlbld = Signal()
        self.mmu_req = Signal() # indicates source of request


class MemAccessRequest(RecordObject):
    def __init__(self, name=None):
        super().__init__(name=name)
        self.op = Signal(Op)
        self.valid = Signal()
        self.dcbz = Signal()
        self.real_addr = Signal(REAL_ADDR_BITS)
        self.data = Signal(64)
        self.byte_sel = Signal(8)
        self.hit_way = Signal(WAY_BITS)
        self.same_tag = Signal()
        self.mmu_req = Signal()


# First stage register, contains state for stage 1 of load hits
# and for the state machine used by all other operations
class RegStage1(RecordObject):
    def __init__(self, name=None):
        super().__init__(name=name)
        # Info about the request
        self.full = Signal()    # have uncompleted request
        self.mmu_req = Signal() # request is from MMU
        self.req = MemAccessRequest(name="reqmem")

        # Cache hit state
        self.hit_way = Signal(WAY_BITS)
        self.hit_load_valid = Signal()
        self.hit_index = Signal(INDEX_BITS)
        self.cache_hit = Signal()

        # TLB hit state
        self.tlb_hit = Signal()
        self.tlb_hit_way = Signal(TLB_WAY_BITS)
        self.tlb_hit_index = Signal(TLB_SET_BITS)

        # 2-stage data buffer for data forwarded from writes to reads
        self.forward_data1 = Signal(64)
        self.forward_data2 = Signal(64)
        self.forward_sel1 = Signal(8)
        self.forward_valid1 = Signal()
        self.forward_way1 = Signal(WAY_BITS)
        self.forward_row1 = Signal(ROW_BITS)
        self.use_forward1 = Signal()
        self.forward_sel = Signal(8)

        # Cache miss state (reload state machine)
        self.state = Signal(State)
        self.dcbz = Signal()
        self.write_bram = Signal()
        self.write_tag = Signal()
        self.slow_valid = Signal()
        self.real_adr = Signal(REAL_ADDR_BITS)
        self.wb = WBMasterOut("wb")
        self.reload_tag = Signal(TAG_BITS)
        self.store_way = Signal(WAY_BITS)
        self.store_row = Signal(ROW_BITS)
        self.store_index = Signal(INDEX_BITS)
        self.end_row_ix = Signal(ROW_LINE_BITS)
        self.rows_valid = RowPerLineValidArray()
        self.acks_pending = Signal(3)
        self.inc_acks = Signal()
        self.dec_acks = Signal()

        # Signals to complete (possibly with error)
        self.ls_valid = Signal()
        self.ls_error = Signal()
        self.mmu_done = Signal()
        self.mmu_error = Signal()
        self.cache_paradox = Signal()

        # Signal to complete a failed stcx.
        self.stcx_fail = Signal()


# Reservation information
class Reservation(RecordObject):
    def __init__(self):
        super().__init__()
        self.valid = Signal()
        self.addr = Signal(64-LINE_OFF_BITS)


class DTLBUpdate(Elaboratable):
    def __init__(self):
        self.tlbie = Signal()
        self.tlbwe = Signal()
        self.doall = Signal()
        self.updated = Signal()
        self.v_updated = Signal()
        self.tlb_hit = Signal()
        self.tlb_req_index = Signal(TLB_SET_BITS)

        self.tlb_hit_way = Signal(TLB_WAY_BITS)
        self.tlb_tag_way = Signal(TLB_TAG_WAY_BITS)
        self.tlb_pte_way = Signal(TLB_PTE_WAY_BITS)
        self.repl_way = Signal(TLB_WAY_BITS)
        self.eatag = Signal(TLB_EA_TAG_BITS)
        self.pte_data = Signal(TLB_PTE_BITS)

        self.dv = Signal(TLB_NUM_WAYS)          # valid bits of this set

        self.tb_out = Signal(TLB_TAG_WAY_BITS)  # tag set to write back
        self.pb_out = Signal(TLB_PTE_WAY_BITS)  # PTE set to write back
        self.db_out = Signal(TLB_NUM_WAYS)      # updated valid bits

    def elaborate(self, platform):
        m = Module()
        comb = m.d.comb
        sync = m.d.sync

        tagset = Signal(TLB_TAG_WAY_BITS)
        pteset = Signal(TLB_PTE_WAY_BITS)

        tb_out, pb_out, db_out = self.tb_out, self.pb_out, self.db_out

        with m.If(self.tlbie & self.doall):
            pass # clear all back in parent
        with m.Elif(self.tlbie):
            with m.If(self.tlb_hit):
                comb += db_out.eq(self.dv)
                # invalidate the entry: clear the hit way's valid bit
                comb += db_out.bit_select(self.tlb_hit_way, 1).eq(0)
                comb += self.v_updated.eq(1)

        with m.Elif(self.tlbwe):

            comb += tagset.eq(self.tlb_tag_way)
            comb += write_tlb_tag(self.repl_way, tagset, self.eatag)
            comb += tb_out.eq(tagset)

            comb += pteset.eq(self.tlb_pte_way)
            comb += write_tlb_pte(self.repl_way, pteset, self.pte_data)
            comb += pb_out.eq(pteset)

            # keep the other ways' valid bits, set the replaced way's
            comb += db_out.eq(self.dv)
            comb += db_out.bit_select(self.repl_way, 1).eq(1)

            comb += self.updated.eq(1)
            comb += self.v_updated.eq(1)

        return m


class DCachePendingHit(Elaboratable):

    def __init__(self, tlb_pte_way, tlb_valid_way, tlb_hit_way,
                 cache_valid_idx, cache_tag_set,
                 req_addr,
                 hit_set):

        self.go = Signal()
        self.virt_mode = Signal()
        self.is_hit = Signal()
        self.tlb_hit = Signal()
        self.hit_way = Signal(WAY_BITS)
        self.rel_match = Signal()
        self.req_index = Signal(INDEX_BITS)
        self.reload_tag = Signal(TAG_BITS)

        self.tlb_hit_way = tlb_hit_way
        self.tlb_pte_way = tlb_pte_way
        self.tlb_valid_way = tlb_valid_way
        self.cache_valid_idx = cache_valid_idx
        self.cache_tag_set = cache_tag_set
        self.req_addr = req_addr
        self.hit_set = hit_set

    def elaborate(self, platform):
        m = Module()
        comb = m.d.comb
        sync = m.d.sync

        go = self.go
        virt_mode = self.virt_mode
        is_hit = self.is_hit
        tlb_pte_way = self.tlb_pte_way
        tlb_valid_way = self.tlb_valid_way
        cache_valid_idx = self.cache_valid_idx
        cache_tag_set = self.cache_tag_set
        req_addr = self.req_addr
        tlb_hit_way = self.tlb_hit_way
        tlb_hit = self.tlb_hit
        hit_set = self.hit_set
        hit_way = self.hit_way
        rel_match = self.rel_match
        req_index = self.req_index
        reload_tag = self.reload_tag

        rel_matches = Array(Signal(name="rel_matches_%d" % i) \
                            for i in range(TLB_NUM_WAYS))
        hit_way_set = HitWaySet()

        # Test if pending request is a hit on any way.
        # In order to make timing in virtual mode,
        # when we are using the TLB, we compare each
        # way with each of the real addresses from each way of
        # the TLB, and then decide later which match to use.

        with m.If(virt_mode):
            for j in range(TLB_NUM_WAYS):
                s_tag = Signal(TAG_BITS, name="s_tag%d" % j)
                s_hit = Signal()
                s_pte = Signal(TLB_PTE_BITS)
                s_ra = Signal(REAL_ADDR_BITS)
                comb += s_pte.eq(read_tlb_pte(j, tlb_pte_way))
                comb += s_ra.eq(Cat(req_addr[0:TLB_LG_PGSZ],
                                    s_pte[TLB_LG_PGSZ:REAL_ADDR_BITS]))
                comb += s_tag.eq(get_tag(s_ra))

                for i in range(NUM_WAYS):
                    is_tag_hit = Signal(name="is_tag_hit_%d_%d" % (j, i))
                    comb += is_tag_hit.eq(go & cache_valid_idx[i] &
                                          (read_tag(i, cache_tag_set) == s_tag)
                                          & tlb_valid_way[j])
                    with m.If(is_tag_hit):
                        comb += hit_way_set[j].eq(i)
                        comb += s_hit.eq(1)
                comb += hit_set[j].eq(s_hit)
                with m.If(s_tag == reload_tag):
                    comb += rel_matches[j].eq(1)
            with m.If(tlb_hit):
                comb += is_hit.eq(hit_set[tlb_hit_way])
                comb += hit_way.eq(hit_way_set[tlb_hit_way])
                comb += rel_match.eq(rel_matches[tlb_hit_way])
        with m.Else():
            s_tag = Signal(TAG_BITS)
            comb += s_tag.eq(get_tag(req_addr))
            for i in range(NUM_WAYS):
                is_tag_hit = Signal(name="is_tag_hit_%d" % i)
                comb += is_tag_hit.eq(go & cache_valid_idx[i] &
                                      (read_tag(i, cache_tag_set) == s_tag))
                with m.If(is_tag_hit):
                    comb += hit_way.eq(i)
                    comb += is_hit.eq(1)
            with m.If(s_tag == reload_tag):
                comb += rel_match.eq(1)

        return m
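

# Editor-added, hedged reference model (illustration only, mirroring the
# s_ra computation above): form the real address by splicing the page
# offset of the effective address onto the RPN taken from the PTE.
def _ea_to_ra_model(ea, pte):
    page_mask = (1 << TLB_LG_PGSZ) - 1
    rpn = pte & (((1 << REAL_ADDR_BITS) - 1) & ~page_mask) # real page number
    return rpn | (ea & page_mask)                          # keep page offset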


class DCache(Elaboratable):
    """Set associative dcache write-through
    TODO (in no specific order):
    * See list in icache.vhdl
    * Complete load misses on the cycle when WB data comes instead of
      at the end of line (this requires dealing with requests coming in
      while not idle...)
    """
    def __init__(self):
        self.d_in = LoadStore1ToDCacheType("d_in")
        self.d_out = DCacheToLoadStore1Type("d_out")

        self.m_in = MMUToDCacheType("m_in")
        self.m_out = DCacheToMMUType("m_out")

        self.stall_out = Signal()

        self.wb_out = WBMasterOut()
        self.wb_in = WBSlaveOut()

        self.log_out = Signal(20)

    def stage_0(self, m, r0, r1, r0_full):
        """Latch the request in r0.req as long as we're not stalling
        """
        comb = m.d.comb
        sync = m.d.sync
        d_in, d_out, m_in = self.d_in, self.d_out, self.m_in

        r = RegStage0("stage0")

        # TODO, this goes in unit tests and formal proofs
        with m.If(d_in.valid & m_in.valid):
            sync += Display("request collision loadstore vs MMU")

        with m.If(m_in.valid):
            sync += r.req.valid.eq(1)
            sync += r.req.load.eq(~(m_in.tlbie | m_in.tlbld))
            sync += r.req.dcbz.eq(0)
            sync += r.req.nc.eq(0)
            sync += r.req.reserve.eq(0)
            sync += r.req.virt_mode.eq(0)
            sync += r.req.priv_mode.eq(1)
            sync += r.req.addr.eq(m_in.addr)
            sync += r.req.data.eq(m_in.pte)
            sync += r.req.byte_sel.eq(~0) # Const -1 sets all to 0b111....
            sync += r.tlbie.eq(m_in.tlbie)
            sync += r.doall.eq(m_in.doall)
            sync += r.tlbld.eq(m_in.tlbld)
            sync += r.mmu_req.eq(1)
        with m.Else():
            sync += r.req.eq(d_in)
            sync += r.tlbie.eq(0)
            sync += r.doall.eq(0)
            sync += r.tlbld.eq(0)
            sync += r.mmu_req.eq(0)
        with m.If(~(r1.full & r0_full)):
            sync += r0.eq(r)
            sync += r0_full.eq(r.req.valid)

    def tlb_read(self, m, r0_stall, tlb_valid_way,
                 tlb_tag_way, tlb_pte_way, dtlb_valid_bits,
                 dtlb_tags, dtlb_ptes):
        """TLB
        Operates in the second cycle on the request latched in r0.req.
        TLB updates write the entry at the end of the second cycle.
        """
        comb = m.d.comb
        sync = m.d.sync
        m_in, d_in = self.m_in, self.d_in

        index = Signal(TLB_SET_BITS)
        addrbits = Signal(TLB_SET_BITS)

        amin = TLB_LG_PGSZ
        amax = TLB_LG_PGSZ + TLB_SET_BITS

        with m.If(m_in.valid):
            comb += addrbits.eq(m_in.addr[amin : amax])
        with m.Else():
            comb += addrbits.eq(d_in.addr[amin : amax])
        comb += index.eq(addrbits)

        # If we have any op and the previous op isn't finished,
        # then keep the same output for next cycle.
        with m.If(~r0_stall):
            sync += tlb_valid_way.eq(dtlb_valid_bits[index])
            sync += tlb_tag_way.eq(dtlb_tags[index])
            sync += tlb_pte_way.eq(dtlb_ptes[index])

    def maybe_tlb_plrus(self, m, r1, tlb_plru_victim):
        """Generate TLB PLRUs
        """
        comb = m.d.comb
        sync = m.d.sync

        if TLB_NUM_WAYS == 0:
            return
        for i in range(TLB_SET_SIZE):
            # TLB PLRU interface
            tlb_plru = PLRU(TLB_WAY_BITS)
            setattr(m.submodules, "maybe_plru_%d" % i, tlb_plru)
            tlb_plru_acc_en = Signal()

            comb += tlb_plru_acc_en.eq(r1.tlb_hit & (r1.tlb_hit_index == i))
            comb += tlb_plru.acc_en.eq(tlb_plru_acc_en)
            comb += tlb_plru.acc_i.eq(r1.tlb_hit_way)
            comb += tlb_plru_victim[i].eq(tlb_plru.lru_o)

    def tlb_search(self, m, tlb_req_index, r0, r0_valid,
                   tlb_valid_way, tlb_tag_way, tlb_hit_way,
                   tlb_pte_way, pte, tlb_hit, valid_ra, perm_attr, ra):

        comb = m.d.comb
        sync = m.d.sync

        hitway = Signal(TLB_WAY_BITS)
        hit = Signal()
        eatag = Signal(TLB_EA_TAG_BITS)

        TLB_LG_END = TLB_LG_PGSZ + TLB_SET_BITS
        comb += tlb_req_index.eq(r0.req.addr[TLB_LG_PGSZ : TLB_LG_END])
        comb += eatag.eq(r0.req.addr[TLB_LG_END : 64 ])

        for i in range(TLB_NUM_WAYS):
            is_tag_hit = Signal()
            comb += is_tag_hit.eq(tlb_valid_way[i]
                                  & (read_tlb_tag(i, tlb_tag_way) == eatag))
            with m.If(is_tag_hit):
                comb += hitway.eq(i)
                comb += hit.eq(1)

        comb += tlb_hit.eq(hit & r0_valid)
        comb += tlb_hit_way.eq(hitway)

        with m.If(tlb_hit):
            comb += pte.eq(read_tlb_pte(hitway, tlb_pte_way))
        with m.Else():
            comb += pte.eq(0)
        comb += valid_ra.eq(tlb_hit | ~r0.req.virt_mode)
        with m.If(r0.req.virt_mode):
            comb += ra.eq(Cat(Const(0, ROW_OFF_BITS),
                              r0.req.addr[ROW_OFF_BITS:TLB_LG_PGSZ],
                              pte[TLB_LG_PGSZ:REAL_ADDR_BITS]))
            comb += perm_attr.reference.eq(pte[8])
            comb += perm_attr.changed.eq(pte[7])
            comb += perm_attr.nocache.eq(pte[5])
            comb += perm_attr.priv.eq(pte[3])
            comb += perm_attr.rd_perm.eq(pte[2])
            comb += perm_attr.wr_perm.eq(pte[1])
        with m.Else():
            comb += ra.eq(Cat(Const(0, ROW_OFF_BITS),
                              r0.req.addr[ROW_OFF_BITS:REAL_ADDR_BITS]))

            comb += perm_attr.reference.eq(1)
            comb += perm_attr.changed.eq(1)
            comb += perm_attr.nocache.eq(0)
            comb += perm_attr.priv.eq(1)
            comb += perm_attr.rd_perm.eq(1)
            comb += perm_attr.wr_perm.eq(1)

    def tlb_update(self, m, r0_valid, r0, dtlb_valid_bits, tlb_req_index,
                   tlb_hit_way, tlb_hit, tlb_plru_victim, tlb_tag_way,
                   dtlb_tags, tlb_pte_way, dtlb_ptes):

        comb = m.d.comb
        sync = m.d.sync

        tlbie = Signal()
        tlbwe = Signal()

        comb += tlbie.eq(r0_valid & r0.tlbie)
        comb += tlbwe.eq(r0_valid & r0.tlbld)

        m.submodules.tlb_update = d = DTLBUpdate()
        with m.If(tlbie & r0.doall):
            # clear all valid bits at once
            for i in range(TLB_SET_SIZE):
                sync += dtlb_valid_bits[i].eq(0)
        with m.If(d.updated):
            sync += dtlb_tags[tlb_req_index].eq(d.tb_out)
            sync += dtlb_ptes[tlb_req_index].eq(d.pb_out)
        with m.If(d.v_updated):
            sync += dtlb_valid_bits[tlb_req_index].eq(d.db_out)

        comb += d.dv.eq(dtlb_valid_bits[tlb_req_index])

        comb += d.tlbie.eq(tlbie)
        comb += d.tlbwe.eq(tlbwe)
        comb += d.doall.eq(r0.doall)
        comb += d.tlb_hit.eq(tlb_hit)
        comb += d.tlb_hit_way.eq(tlb_hit_way)
        comb += d.tlb_tag_way.eq(tlb_tag_way)
        comb += d.tlb_pte_way.eq(tlb_pte_way)
        comb += d.tlb_req_index.eq(tlb_req_index)

        with m.If(tlb_hit):
            comb += d.repl_way.eq(tlb_hit_way)
        with m.Else():
            comb += d.repl_way.eq(tlb_plru_victim[tlb_req_index])
        comb += d.eatag.eq(r0.req.addr[TLB_LG_PGSZ + TLB_SET_BITS:64])
        comb += d.pte_data.eq(r0.req.data)

    def maybe_plrus(self, m, r1, plru_victim):
        """Generate PLRUs
        """
        comb = m.d.comb
        sync = m.d.sync

        if NUM_WAYS == 0:
            return

        for i in range(NUM_LINES):
            # PLRU interface
            plru = PLRU(WAY_BITS)
            setattr(m.submodules, "plru%d" % i, plru)
            plru_acc_en = Signal()

            comb += plru_acc_en.eq(r1.cache_hit & (r1.hit_index == i))
            comb += plru.acc_en.eq(plru_acc_en)
            comb += plru.acc_i.eq(r1.hit_way)
            comb += plru_victim[i].eq(plru.lru_o)

    def cache_tag_read(self, m, r0_stall, req_index, cache_tag_set, cache_tags):
        """Cache tag RAM read port
        """
        comb = m.d.comb
        sync = m.d.sync
        m_in, d_in = self.m_in, self.d_in

        index = Signal(INDEX_BITS)

        with m.If(r0_stall):
            comb += index.eq(req_index)
        with m.Elif(m_in.valid):
            comb += index.eq(get_index(m_in.addr))
        with m.Else():
            comb += index.eq(get_index(d_in.addr))
        sync += cache_tag_set.eq(cache_tags[index])

    def dcache_request(self, m, r0, ra, req_index, req_row, req_tag,
                       r0_valid, r1, cache_valids, replace_way,
                       use_forward1_next, use_forward2_next,
                       req_hit_way, plru_victim, rc_ok, perm_attr,
                       valid_ra, perm_ok, access_ok, req_op, req_go,
                       tlb_pte_way,
                       tlb_hit, tlb_hit_way, tlb_valid_way, cache_tag_set,
                       cancel_store, req_same_tag, r0_stall, early_req_row):
        """Cache request parsing and hit detection
        """

        comb = m.d.comb
        sync = m.d.sync
        m_in, d_in = self.m_in, self.d_in

        is_hit = Signal()
        hit_way = Signal(WAY_BITS)
        op = Signal(Op)
        opsel = Signal(3)
        go = Signal()
        nc = Signal()
        hit_set = Array(Signal(name="hit_set_%d" % i) \
                        for i in range(TLB_NUM_WAYS))
        cache_valid_idx = Signal(NUM_WAYS)

        # Extract line, row and tag from request
        comb += req_index.eq(get_index(r0.req.addr))
        comb += req_row.eq(get_row(r0.req.addr))
        comb += req_tag.eq(get_tag(ra))

        if False: # display on comb is a bit... busy.
            comb += Display("dcache_req addr:%x ra: %x idx: %x tag: %x row: %x",
                            r0.req.addr, ra, req_index, req_tag, req_row)

        comb += go.eq(r0_valid & ~(r0.tlbie | r0.tlbld) & ~r1.ls_error)
        comb += cache_valid_idx.eq(cache_valids[req_index])

        m.submodules.dcache_pend = dc = DCachePendingHit(tlb_pte_way,
                                        tlb_valid_way, tlb_hit_way,
                                        cache_valid_idx, cache_tag_set,
                                        r0.req.addr,
                                        hit_set)

        comb += dc.tlb_hit.eq(tlb_hit)
        comb += dc.reload_tag.eq(r1.reload_tag)
        comb += dc.virt_mode.eq(r0.req.virt_mode)
        comb += dc.go.eq(go)
        comb += dc.req_index.eq(req_index)
        comb += is_hit.eq(dc.is_hit)
        comb += hit_way.eq(dc.hit_way)
        comb += req_same_tag.eq(dc.rel_match)

        # See if the request matches the line currently being reloaded
        with m.If((r1.state == State.RELOAD_WAIT_ACK) &
                  (req_index == r1.store_index) & req_same_tag):
            # For a store, consider this a hit even if the row isn't
            # valid since it will be by the time we perform the store.
            # For a load, check the appropriate row valid bit.
            rrow = Signal(ROW_LINE_BITS)
            comb += rrow.eq(req_row)
            valid = r1.rows_valid[rrow]
            comb += is_hit.eq(~r0.req.load | valid)
            comb += hit_way.eq(replace_way)

        # Whether to use forwarded data for a load or not
        with m.If((get_row(r1.req.real_addr) == req_row) &
                  (r1.req.hit_way == hit_way)):
            # Only need to consider r1.write_bram here, since if we
            # are writing refill data here, then we don't have a
            # cache hit this cycle on the line being refilled.
            # (There is the possibility that the load following the
            # load miss that started the refill could be to the old
            # contents of the victim line, since it is a couple of
            # cycles after the refill starts before we see the updated
            # cache tag. In that case we don't use the bypass.)
            comb += use_forward1_next.eq(r1.write_bram)
        with m.If((r1.forward_row1 == req_row) & (r1.forward_way1 == hit_way)):
            comb += use_forward2_next.eq(r1.forward_valid1)

        # The way that matched on a hit
        comb += req_hit_way.eq(hit_way)

        # The way to replace on a miss
        with m.If(r1.write_tag):
            comb += replace_way.eq(plru_victim[r1.store_index])
        with m.Else():
            comb += replace_way.eq(r1.store_way)

        # work out whether we have permission for this access
        # NB we don't yet implement AMR, thus no KUAP
        comb += rc_ok.eq(perm_attr.reference
                         & (r0.req.load | perm_attr.changed)
                         )
        comb += perm_ok.eq((r0.req.priv_mode | ~perm_attr.priv) &
                           (perm_attr.wr_perm |
                            (r0.req.load & perm_attr.rd_perm)))
        comb += access_ok.eq(valid_ra & perm_ok & rc_ok)
        # Combine the request and cache hit status to decide what
        # operation needs to be done
        comb += nc.eq(r0.req.nc | perm_attr.nocache)
        comb += op.eq(Op.OP_NONE)
        with m.If(go):
            with m.If(~access_ok):
                comb += op.eq(Op.OP_BAD)
            with m.Elif(cancel_store):
                comb += op.eq(Op.OP_STCX_FAIL)
            with m.Else():
                comb += opsel.eq(Cat(is_hit, nc, r0.req.load))
                with m.Switch(opsel):
                    with m.Case(0b101): comb += op.eq(Op.OP_LOAD_HIT)
                    with m.Case(0b100): comb += op.eq(Op.OP_LOAD_MISS)
                    with m.Case(0b110): comb += op.eq(Op.OP_LOAD_NC)
                    with m.Case(0b001): comb += op.eq(Op.OP_STORE_HIT)
                    with m.Case(0b000): comb += op.eq(Op.OP_STORE_MISS)
                    with m.Case(0b010): comb += op.eq(Op.OP_STORE_MISS)
                    with m.Case(0b011): comb += op.eq(Op.OP_BAD)
                    with m.Case(0b111): comb += op.eq(Op.OP_BAD)
        comb += req_op.eq(op)
        comb += req_go.eq(go)

        # Version of the row number that is valid one cycle earlier
        # in the cases where we need to read the cache data BRAM.
        # If we're stalling then we need to keep reading the last
        # row requested.
        with m.If(~r0_stall):
            with m.If(m_in.valid):
                comb += early_req_row.eq(get_row(m_in.addr))
            with m.Else():
                comb += early_req_row.eq(get_row(d_in.addr))
        with m.Else():
            comb += early_req_row.eq(req_row)

    def reservation_comb(self, m, cancel_store, set_rsrv, clear_rsrv,
                         r0_valid, r0, reservation):
        """Handle load-with-reservation and store-conditional instructions
        """
        comb = m.d.comb
        sync = m.d.sync

        with m.If(r0_valid & r0.req.reserve):
            # XXX generate alignment interrupt if address
            # is not aligned XXX or if r0.req.nc = '1'
            with m.If(r0.req.load):
                comb += set_rsrv.eq(1) # load with reservation
            with m.Else():
                comb += clear_rsrv.eq(1) # store conditional
                with m.If(~reservation.valid |
                          (r0.req.addr[LINE_OFF_BITS:64] != reservation.addr)):
                    comb += cancel_store.eq(1)

    def reservation_reg(self, m, r0_valid, access_ok, set_rsrv, clear_rsrv,
                        reservation, r0):

        comb = m.d.comb
        sync = m.d.sync

        with m.If(r0_valid & access_ok):
            with m.If(clear_rsrv):
                sync += reservation.valid.eq(0)
            with m.Elif(set_rsrv):
                sync += reservation.valid.eq(1)
                sync += reservation.addr.eq(r0.req.addr[LINE_OFF_BITS:64])

    def writeback_control(self, m, r1, cache_out):
        """Return data for loads & completion control logic
        """
        comb = m.d.comb
        sync = m.d.sync
        d_out, m_out = self.d_out, self.m_out

        data_out = Signal(64)
        data_fwd = Signal(64)

        # Use the bypass if we are reading the row that was
        # written 1 or 2 cycles ago, including for the
        # slow_valid = 1 case (i.e. completing a load
        # miss or a non-cacheable load).
        with m.If(r1.use_forward1):
            comb += data_fwd.eq(r1.forward_data1)
        with m.Else():
            comb += data_fwd.eq(r1.forward_data2)

        comb += data_out.eq(cache_out[r1.hit_way])

        for i in range(8):
            with m.If(r1.forward_sel[i]):
                dsel = data_fwd.word_select(i, 8)
                comb += data_out.word_select(i, 8).eq(dsel)

        comb += d_out.valid.eq(r1.ls_valid)
        comb += d_out.data.eq(data_out)
        comb += d_out.store_done.eq(~r1.stcx_fail)
        comb += d_out.error.eq(r1.ls_error)
        comb += d_out.cache_paradox.eq(r1.cache_paradox)

        # Outputs to MMU
        comb += m_out.done.eq(r1.mmu_done)
        comb += m_out.err.eq(r1.mmu_error)
        comb += m_out.data.eq(data_out)

        # We have a valid load or store hit or we just completed
        # a slow op such as a load miss, a NC load or a store
        #
        # Note: the load hit is delayed by one cycle. However it
        # can still not collide with r.slow_valid (well unless I
        # miscalculated) because slow_valid can only be set on a
        # subsequent request and not on its first cycle (the state
        # machine must have advanced), which makes slow_valid
        # at least 2 cycles from the previous hit_load_valid.

        # Sanity: Only one of these must be set in any given cycle

        if False: # TODO: need Display to get this to work
            assert (r1.slow_valid & r1.stcx_fail) != 1, \
                "unexpected slow_valid collision with stcx_fail"

            assert ((r1.slow_valid | r1.stcx_fail) | r1.hit_load_valid) != 1, \
                "unexpected hit_load_delayed collision with slow_valid"

        with m.If(~r1.mmu_req):
            # Request came from loadstore1...
            # Load hit case is the standard path
            with m.If(r1.hit_load_valid):
                sync += Display("completing load hit data=%x", data_out)

            # error cases complete without stalling
            with m.If(r1.ls_error):
                sync += Display("completing ld/st with error")

            # Slow ops (load miss, NC, stores)
            with m.If(r1.slow_valid):
                sync += Display("completing store or load miss data=%x",
                                data_out)

        with m.Else():
            # Request came from MMU
            with m.If(r1.hit_load_valid):
                sync += Display("completing load hit to MMU, data=%x",
                                m_out.data)
            # error cases complete without stalling
            with m.If(r1.mmu_error):
                sync += Display("completing MMU ld with error")

            # Slow ops (i.e. load miss)
            with m.If(r1.slow_valid):
                sync += Display("completing MMU load miss, data=%x",
                                m_out.data)

    def rams(self, m, r1, early_req_row, cache_out, replace_way):
        """rams
        Generate a cache RAM for each way. This handles the normal
        reads, writes from reloads and the special store-hit update
        path as well.

        Note: the BRAMs have an extra read buffer, meaning the output
        is pipelined an extra cycle. This differs from the
        icache. The writeback logic needs to take that into
        account by using 1-cycle delayed signals for load hits.
        """
        comb = m.d.comb
        wb_in = self.wb_in

        for i in range(NUM_WAYS):
            do_read = Signal(name="do_rd%d" % i)
            rd_addr = Signal(ROW_BITS)
            do_write = Signal(name="do_wr%d" % i)
            wr_addr = Signal(ROW_BITS)
            wr_data = Signal(WB_DATA_BITS)
            wr_sel = Signal(ROW_SIZE)
            wr_sel_m = Signal(ROW_SIZE)
            _d_out = Signal(WB_DATA_BITS, name="dout_%d" % i)

            way = CacheRam(ROW_BITS, WB_DATA_BITS, True)
            setattr(m.submodules, "cacheram_%d" % i, way)

            comb += way.rd_en.eq(do_read)
            comb += way.rd_addr.eq(rd_addr)
            comb += _d_out.eq(way.rd_data_o)
            comb += way.wr_sel.eq(wr_sel_m)
            comb += way.wr_addr.eq(wr_addr)
            comb += way.wr_data.eq(wr_data)

            # Cache hit reads
            comb += do_read.eq(1)
            comb += rd_addr.eq(early_req_row[:ROW_BITS])
            comb += cache_out[i].eq(_d_out)

            # Write mux:
            #
            # Defaults to wishbone read responses (cache refill)
            #
            # For timing, the mux on wr_data/sel/addr is not
            # dependent on anything other than the current state.

            with m.If(r1.write_bram):
                # Write store data to BRAM. This happens one
                # cycle after the store is in r0.
                comb += wr_data.eq(r1.req.data)
                comb += wr_sel.eq(r1.req.byte_sel)
                comb += wr_addr.eq(get_row(r1.req.real_addr))

                with m.If(i == r1.req.hit_way):
                    comb += do_write.eq(1)
            with m.Else():
                # Otherwise, we might be doing a reload or a DCBZ
                with m.If(r1.dcbz):
                    comb += wr_data.eq(0)
                with m.Else():
                    comb += wr_data.eq(wb_in.dat)
                comb += wr_addr.eq(r1.store_row)
                comb += wr_sel.eq(~0) # all 1s

                with m.If((r1.state == State.RELOAD_WAIT_ACK)
                          & wb_in.ack & (replace_way == i)):
                    comb += do_write.eq(1)

            # Mask write selects with do_write since BRAM
            # doesn't have a global write-enable
            with m.If(do_write):
                comb += wr_sel_m.eq(wr_sel)

    # Cache hit synchronous machine for the easy case.
    # This handles load hits.
    # It also handles error cases (TLB miss, cache paradox)
    def dcache_fast_hit(self, m, req_op, r0_valid, r0, r1,
                        req_hit_way, req_index, req_tag, access_ok,
                        tlb_hit, tlb_hit_way, tlb_req_index):

        comb = m.d.comb
        sync = m.d.sync

        with m.If(req_op != Op.OP_NONE):
            sync += Display("op:%d addr:%x nc: %d idx: %x tag: %x way: %x",
                            req_op, r0.req.addr, r0.req.nc,
                            req_index, req_tag, req_hit_way)

        with m.If(r0_valid):
            sync += r1.mmu_req.eq(r0.mmu_req)

        # Fast path for load/store hits.
        # Set signals for the writeback controls.
        sync += r1.hit_way.eq(req_hit_way)
        sync += r1.hit_index.eq(req_index)

        with m.If(req_op == Op.OP_LOAD_HIT):
            sync += r1.hit_load_valid.eq(1)
        with m.Else():
            sync += r1.hit_load_valid.eq(0)

        with m.If((req_op == Op.OP_LOAD_HIT) | (req_op == Op.OP_STORE_HIT)):
            sync += r1.cache_hit.eq(1)
        with m.Else():
            sync += r1.cache_hit.eq(0)

        with m.If(req_op == Op.OP_BAD):
            # Display(f"Signalling ld/st error valid_ra={valid_ra}"
            #         f"rc_ok={rc_ok} perm_ok={perm_ok}"
            sync += r1.ls_error.eq(~r0.mmu_req)
            sync += r1.mmu_error.eq(r0.mmu_req)
            sync += r1.cache_paradox.eq(access_ok)

        with m.Else():
            sync += r1.ls_error.eq(0)
            sync += r1.mmu_error.eq(0)
            sync += r1.cache_paradox.eq(0)

        with m.If(req_op == Op.OP_STCX_FAIL):
            sync += r1.stcx_fail.eq(1)
        with m.Else():
            sync += r1.stcx_fail.eq(0)

        # Record TLB hit information for updating TLB PLRU
        sync += r1.tlb_hit.eq(tlb_hit)
        sync += r1.tlb_hit_way.eq(tlb_hit_way)
        sync += r1.tlb_hit_index.eq(tlb_req_index)

    # Memory accesses are handled by this state machine:
    #
    #   * Cache load miss/reload (in conjunction with "rams")
    #   * Load hits for non-cacheable forms
    #   * Stores (the collision case is handled in "rams")
    #
    # All wishbone request generation is done here.
    # This machine operates at stage 1.
    def dcache_slow(self, m, r1, use_forward1_next, use_forward2_next,
                    cache_valids, r0, replace_way,
                    req_hit_way, req_same_tag,
                    r0_valid, req_op, cache_tags, req_go, ra):

        comb = m.d.comb
        sync = m.d.sync
        wb_in = self.wb_in

        req = MemAccessRequest("mreq_ds")
        acks = Signal(3)
        adjust_acks = Signal(3)

        req_row = Signal(ROW_BITS)
        req_idx = Signal(INDEX_BITS)
        req_tag = Signal(TAG_BITS)
        comb += req_idx.eq(get_index(req.real_addr))
        comb += req_row.eq(get_row(req.real_addr))
        comb += req_tag.eq(get_tag(req.real_addr))

        sync += r1.use_forward1.eq(use_forward1_next)
        sync += r1.forward_sel.eq(0)

        with m.If(use_forward1_next):
            sync += r1.forward_sel.eq(r1.req.byte_sel)
        with m.Elif(use_forward2_next):
            sync += r1.forward_sel.eq(r1.forward_sel1)

        sync += r1.forward_data2.eq(r1.forward_data1)
        with m.If(r1.write_bram):
            sync += r1.forward_data1.eq(r1.req.data)
            sync += r1.forward_sel1.eq(r1.req.byte_sel)
            sync += r1.forward_way1.eq(r1.req.hit_way)
            sync += r1.forward_row1.eq(get_row(r1.req.real_addr))
            sync += r1.forward_valid1.eq(1)
        with m.Else():
            with m.If(r1.dcbz):
                sync += r1.forward_data1.eq(0)
            with m.Else():
                sync += r1.forward_data1.eq(wb_in.dat)
            sync += r1.forward_sel1.eq(~0) # all 1s
            sync += r1.forward_way1.eq(replace_way)
            sync += r1.forward_row1.eq(r1.store_row)
            sync += r1.forward_valid1.eq(0)

        # One cycle pulses reset
        sync += r1.slow_valid.eq(0)
        sync += r1.write_bram.eq(0)
        sync += r1.inc_acks.eq(0)
        sync += r1.dec_acks.eq(0)

        sync += r1.ls_valid.eq(0)
        # complete tlbies and TLB loads in the third cycle
        sync += r1.mmu_done.eq(r0_valid & (r0.tlbie | r0.tlbld))

        with m.If((req_op == Op.OP_LOAD_HIT)
                  | (req_op == Op.OP_STCX_FAIL)):
            with m.If(~r0.mmu_req):
                sync += r1.ls_valid.eq(1)
            with m.Else():
                sync += r1.mmu_done.eq(1)

        with m.If(r1.write_tag):
            # Store new tag in selected way
            for i in range(NUM_WAYS):
                with m.If(i == replace_way):
                    ct = Signal(TAG_RAM_WIDTH)
                    comb += ct.eq(cache_tags[r1.store_index])
                    comb += ct.word_select(i, TAG_WIDTH).eq(r1.reload_tag)
                    sync += cache_tags[r1.store_index].eq(ct)
            sync += r1.store_way.eq(replace_way)
            sync += r1.write_tag.eq(0)

        # Take request from r1.req if there is one there,
        # else from req_op, ra, etc.
        with m.If(r1.full):
            comb += req.eq(r1.req)
        with m.Else():
            comb += req.op.eq(req_op)
            comb += req.valid.eq(req_go)
            comb += req.mmu_req.eq(r0.mmu_req)
            comb += req.dcbz.eq(r0.req.dcbz)
            comb += req.real_addr.eq(ra)

            with m.If(~r0.req.dcbz):
                comb += req.data.eq(r0.req.data)
            with m.Else():
                comb += req.data.eq(0)

            # Select all bytes for dcbz
            # and for cacheable loads
            with m.If(r0.req.dcbz | (r0.req.load & ~r0.req.nc)):
                comb += req.byte_sel.eq(~0) # all 1s
            with m.Else():
                comb += req.byte_sel.eq(r0.req.byte_sel)
            comb += req.hit_way.eq(req_hit_way)
            comb += req.same_tag.eq(req_same_tag)

        # Store the incoming request from r0,
        # if it is a slow request
        # Note that r1.full = 1 implies req_op = OP_NONE
        with m.If((req_op == Op.OP_LOAD_MISS)
                  | (req_op == Op.OP_LOAD_NC)
                  | (req_op == Op.OP_STORE_MISS)
                  | (req_op == Op.OP_STORE_HIT)):
            sync += r1.req.eq(req)
            sync += r1.full.eq(1)

        # Main state machine
        with m.Switch(r1.state):

            with m.Case(State.IDLE):
                sync += r1.real_adr.eq(req.real_addr)
                sync += r1.wb.sel.eq(req.byte_sel)
                sync += r1.wb.dat.eq(req.data)
                sync += r1.dcbz.eq(req.dcbz)

                # Keep track of our index and way
                # for subsequent stores.
                sync += r1.store_index.eq(req_idx)
                sync += r1.store_row.eq(req_row)
                sync += r1.end_row_ix.eq(get_row_of_line(req_row))
                sync += r1.reload_tag.eq(req_tag)
                sync += r1.req.same_tag.eq(1)

                with m.If(req.op == Op.OP_STORE_HIT):
                    sync += r1.store_way.eq(req.hit_way)

                # Reset per-row valid bits,
                # ready for handling OP_LOAD_MISS
                for i in range(ROW_PER_LINE):
                    sync += r1.rows_valid[i].eq(0)

                with m.If(req_op != Op.OP_NONE):
                    sync += Display("cache op %d", req.op)

                with m.Switch(req.op):
                    with m.Case(Op.OP_LOAD_HIT):
                        # stay in IDLE state
                        pass

                    with m.Case(Op.OP_LOAD_MISS):
                        sync += Display("cache miss real addr: %x " \
                                        "idx: %x tag: %x",
                                        req.real_addr, req_row, req_tag)

                        # Start the wishbone cycle
                        sync += r1.wb.we.eq(0)
                        sync += r1.wb.cyc.eq(1)
                        sync += r1.wb.stb.eq(1)

                        # Track that we had one request sent
                        sync += r1.state.eq(State.RELOAD_WAIT_ACK)
                        sync += r1.write_tag.eq(1)

                    with m.Case(Op.OP_LOAD_NC):
                        sync += r1.wb.cyc.eq(1)
                        sync += r1.wb.stb.eq(1)
                        sync += r1.wb.we.eq(0)
                        sync += r1.state.eq(State.NC_LOAD_WAIT_ACK)

                    with m.Case(Op.OP_STORE_HIT, Op.OP_STORE_MISS):
                        with m.If(~req.dcbz):
                            sync += r1.state.eq(State.STORE_WAIT_ACK)
                            sync += r1.acks_pending.eq(1)
                            sync += r1.full.eq(0)
                            sync += r1.slow_valid.eq(1)

                            with m.If(~req.mmu_req):
                                sync += r1.ls_valid.eq(1)
                            with m.Else():
                                sync += r1.mmu_done.eq(1)

                            with m.If(req.op == Op.OP_STORE_HIT):
                                sync += r1.write_bram.eq(1)
                        with m.Else():
                            # dcbz is handled much like a load miss except
                            # that we are writing to memory instead of reading
                            sync += r1.state.eq(State.RELOAD_WAIT_ACK)

                            with m.If(req.op == Op.OP_STORE_MISS):
                                sync += r1.write_tag.eq(1)

                        sync += r1.wb.we.eq(1)
                        sync += r1.wb.cyc.eq(1)
                        sync += r1.wb.stb.eq(1)

                    # OP_NONE and OP_BAD do nothing
                    # OP_BAD & OP_STCX_FAIL were
                    # handled above already
                    with m.Case(Op.OP_NONE):
                        pass
                    with m.Case(Op.OP_BAD):
                        pass
                    with m.Case(Op.OP_STCX_FAIL):
                        pass

            with m.Case(State.RELOAD_WAIT_ACK):
                ld_stbs_done = Signal()
                # Requests are all sent if stb is 0
                comb += ld_stbs_done.eq(~r1.wb.stb)

                with m.If((~wb_in.stall) & r1.wb.stb):
                    # That was the last word?
                    # We are done sending.
                    # Clear stb and set ld_stbs_done
                    # so we can handle an eventual
                    # last ack on the same cycle.
                    with m.If(is_last_row_addr(r1.real_adr, r1.end_row_ix)):
                        sync += r1.wb.stb.eq(0)
                        comb += ld_stbs_done.eq(1)

                    # Calculate the next row address in the current cache line
                    row = Signal(LINE_OFF_BITS-ROW_OFF_BITS)
                    comb += row.eq(r1.real_adr[ROW_OFF_BITS:])
                    sync += r1.real_adr[ROW_OFF_BITS:LINE_OFF_BITS].eq(row+1)

                # Incoming acks processing
                sync += r1.forward_valid1.eq(wb_in.ack)
                with m.If(wb_in.ack):
                    srow = Signal(ROW_LINE_BITS)
                    comb += srow.eq(r1.store_row)
                    sync += r1.rows_valid[srow].eq(1)

                    # If this is the data we were looking for,
                    # we can complete the request next cycle.
                    # Compare the whole address in case the
                    # request in r1.req is not the one that
                    # started this refill.
                    with m.If(r1.full & r1.req.same_tag &
                              ((r1.dcbz & r1.req.dcbz) |
                               (~r1.dcbz & (r1.req.op == Op.OP_LOAD_MISS))) &
                              (r1.store_row == get_row(r1.req.real_addr))):
                        sync += r1.full.eq(0)
                        sync += r1.slow_valid.eq(1)
                        with m.If(~r1.mmu_req):
                            sync += r1.ls_valid.eq(1)
                        with m.Else():
                            sync += r1.mmu_done.eq(1)
                        sync += r1.forward_sel.eq(~0) # all 1s
                        sync += r1.use_forward1.eq(1)

                    # Check for completion
                    with m.If(ld_stbs_done & is_last_row(r1.store_row,
                                                         r1.end_row_ix)):
                        # Complete wishbone cycle
                        sync += r1.wb.cyc.eq(0)

                        # Cache line is now valid
                        cv = Signal(NUM_WAYS)
                        comb += cv.eq(cache_valids[r1.store_index])
                        comb += cv.bit_select(r1.store_way, 1).eq(1)
                        sync += cache_valids[r1.store_index].eq(cv)
                        sync += r1.state.eq(State.IDLE)

                    # Increment store row counter
                    sync += r1.store_row.eq(next_row(r1.store_row))

            with m.Case(State.STORE_WAIT_ACK):
                st_stbs_done = Signal()
                comb += st_stbs_done.eq(~r1.wb.stb)
                comb += acks.eq(r1.acks_pending)

                with m.If(r1.inc_acks != r1.dec_acks):
                    with m.If(r1.inc_acks):
                        comb += adjust_acks.eq(acks + 1)
                    with m.Else():
                        comb += adjust_acks.eq(acks - 1)
                with m.Else():
                    comb += adjust_acks.eq(acks)

                sync += r1.acks_pending.eq(adjust_acks)

                # Clear stb when slave accepted request
                with m.If(~wb_in.stall):
                    # See if there is another store waiting
                    # to be done which is in the same real page.
                    with m.If(req.valid):
                        adr = req.real_addr[0:SET_SIZE_BITS]
                        sync += r1.real_adr[0:SET_SIZE_BITS].eq(adr)
                        sync += r1.wb.dat.eq(req.data)
                        sync += r1.wb.sel.eq(req.byte_sel)

                    with m.Elif((adjust_acks < 7) & req.same_tag &
                                ((req.op == Op.OP_STORE_MISS)
                                 | (req.op == Op.OP_STORE_HIT))):
                        sync += r1.wb.stb.eq(1)
                        comb += st_stbs_done.eq(0)

                        with m.If(req.op == Op.OP_STORE_HIT):
                            sync += r1.write_bram.eq(1)
                        sync += r1.full.eq(0)
                        sync += r1.slow_valid.eq(1)

                        # Store requests never come from the MMU
                        sync += r1.ls_valid.eq(1)
                        comb += st_stbs_done.eq(0)
                        sync += r1.inc_acks.eq(1)
                    with m.Else():
                        sync += r1.wb.stb.eq(0)
                        comb += st_stbs_done.eq(1)

                # Got ack ? See if complete.
                with m.If(wb_in.ack):
                    with m.If(st_stbs_done & (adjust_acks == 1)):
                        sync += r1.state.eq(State.IDLE)
                        sync += r1.wb.cyc.eq(0)
                        sync += r1.wb.stb.eq(0)
                    sync += r1.dec_acks.eq(1)

            with m.Case(State.NC_LOAD_WAIT_ACK):
                # Clear stb when slave accepted request
                with m.If(~wb_in.stall):
                    sync += r1.wb.stb.eq(0)

                # Got ack ? complete.
                with m.If(wb_in.ack):
                    sync += r1.state.eq(State.IDLE)
                    sync += r1.full.eq(0)
                    sync += r1.slow_valid.eq(1)

                    with m.If(~r1.mmu_req):
                        sync += r1.ls_valid.eq(1)
                    with m.Else():
                        sync += r1.mmu_done.eq(1)

                    sync += r1.forward_sel.eq(~0) # all 1s
                    sync += r1.use_forward1.eq(1)
                    sync += r1.wb.cyc.eq(0)
                    sync += r1.wb.stb.eq(0)

    def dcache_log(self, m, r1, valid_ra, tlb_hit_way, stall_out, req_op):

        sync = m.d.sync
        d_out, wb_in, log_out = self.d_out, self.wb_in, self.log_out

        sync += log_out.eq(Cat(r1.state[:3], valid_ra, tlb_hit_way[:3],
                               stall_out, req_op[:3], d_out.valid, d_out.error,
                               r1.wb.cyc, r1.wb.stb, wb_in.ack, wb_in.stall,
                               r1.real_adr[3:6]))

    def elaborate(self, platform):

        m = Module()
        comb = m.d.comb

        # Storage. Hopefully "cache_rows" is a BRAM, the rest is LUTs
        cache_tags = CacheTagArray()
        cache_tag_set = Signal(TAG_RAM_WIDTH)
        cache_valids = CacheValidBitsArray()

        # TODO attribute ram_style : string;
        # TODO attribute ram_style of cache_tags : signal is "distributed";

        """note: these are passed to nmigen.hdl.Memory as "attributes".
           don't know how, just that they are.
        """
        dtlb_valid_bits = TLBValidBitsArray()
        dtlb_tags = TLBTagsArray()
        dtlb_ptes = TLBPtesArray()
        # TODO attribute ram_style of
        #  dtlb_tags : signal is "distributed";
        # TODO attribute ram_style of
        #  dtlb_ptes : signal is "distributed";

        r0 = RegStage0("r0")
        r0_full = Signal()

        r1 = RegStage1("r1")

        reservation = Reservation()

        # Async signals on incoming request
        req_index = Signal(INDEX_BITS)
        req_row = Signal(ROW_BITS)
        req_hit_way = Signal(WAY_BITS)
        req_tag = Signal(TAG_BITS)
        req_op = Signal(Op)
        req_data = Signal(64)
        req_same_tag = Signal()
        req_go = Signal()

        early_req_row = Signal(ROW_BITS)

        cancel_store = Signal()
        set_rsrv = Signal()
        clear_rsrv = Signal()

        r0_valid = Signal()
        r0_stall = Signal()

        use_forward1_next = Signal()
        use_forward2_next = Signal()

        cache_out = CacheRamOut()

        plru_victim = PLRUOut()
        replace_way = Signal(WAY_BITS)

        # Wishbone read/write/cache write formatting signals
        bus_sel = Signal(8)

        # TLB signals
        tlb_tag_way = Signal(TLB_TAG_WAY_BITS)
        tlb_pte_way = Signal(TLB_PTE_WAY_BITS)
        tlb_valid_way = Signal(TLB_NUM_WAYS)
        tlb_req_index = Signal(TLB_SET_BITS)
        tlb_hit = Signal()
        tlb_hit_way = Signal(TLB_WAY_BITS)
        pte = Signal(TLB_PTE_BITS)
        ra = Signal(REAL_ADDR_BITS)
        valid_ra = Signal()
        perm_attr = PermAttr("dc_perms")
        rc_ok = Signal()
        perm_ok = Signal()
        access_ok = Signal()

        tlb_plru_victim = TLBPLRUOut()

        # we don't yet handle collisions between loadstore1 requests
        # and MMU requests
        comb += self.m_out.stall.eq(0)

        # Hold off the request in r0 when r1 has an uncompleted request
        comb += r0_stall.eq(r0_full & r1.full)
        comb += r0_valid.eq(r0_full & ~r1.full)
        comb += self.stall_out.eq(r0_stall)

        # Wire up wishbone request latch out of stage 1
        comb += r1.wb.adr.eq(r1.real_adr[ROW_OFF_BITS:]) # truncate LSBs
        comb += self.wb_out.eq(r1.wb)

        # call sub-functions putting everything together, using shared
        # signals established above
        self.stage_0(m, r0, r1, r0_full)
        self.tlb_read(m, r0_stall, tlb_valid_way,
                      tlb_tag_way, tlb_pte_way, dtlb_valid_bits,
                      dtlb_tags, dtlb_ptes)
        self.tlb_search(m, tlb_req_index, r0, r0_valid,
                        tlb_valid_way, tlb_tag_way, tlb_hit_way,
                        tlb_pte_way, pte, tlb_hit, valid_ra, perm_attr, ra)
        self.tlb_update(m, r0_valid, r0, dtlb_valid_bits, tlb_req_index,
                        tlb_hit_way, tlb_hit, tlb_plru_victim, tlb_tag_way,
                        dtlb_tags, tlb_pte_way, dtlb_ptes)
        self.maybe_plrus(m, r1, plru_victim)
        self.maybe_tlb_plrus(m, r1, tlb_plru_victim)
        self.cache_tag_read(m, r0_stall, req_index, cache_tag_set, cache_tags)
        self.dcache_request(m, r0, ra, req_index, req_row, req_tag,
                            r0_valid, r1, cache_valids, replace_way,
                            use_forward1_next, use_forward2_next,
                            req_hit_way, plru_victim, rc_ok, perm_attr,
                            valid_ra, perm_ok, access_ok, req_op, req_go,
                            tlb_pte_way,
                            tlb_hit, tlb_hit_way, tlb_valid_way, cache_tag_set,
                            cancel_store, req_same_tag, r0_stall,
                            early_req_row)
        self.reservation_comb(m, cancel_store, set_rsrv, clear_rsrv,
                              r0_valid, r0, reservation)
        self.reservation_reg(m, r0_valid, access_ok, set_rsrv, clear_rsrv,
                             reservation, r0)
        self.writeback_control(m, r1, cache_out)
        self.rams(m, r1, early_req_row, cache_out, replace_way)
        self.dcache_fast_hit(m, req_op, r0_valid, r0, r1,
                             req_hit_way, req_index, req_tag, access_ok,
                             tlb_hit, tlb_hit_way, tlb_req_index)
        self.dcache_slow(m, r1, use_forward1_next, use_forward2_next,
                         cache_valids, r0, replace_way,
                         req_hit_way, req_same_tag,
                         r0_valid, req_op, cache_tags, req_go, ra)
        #self.dcache_log(m, r1, valid_ra, tlb_hit_way, self.stall_out, req_op)

        return m


def dcache_load(dut, addr, nc=0):
    yield dut.d_in.load.eq(1)
    yield dut.d_in.nc.eq(nc)
    yield dut.d_in.addr.eq(addr)
    yield dut.d_in.byte_sel.eq(~0)
    yield dut.d_in.valid.eq(1)
    yield
    yield dut.d_in.valid.eq(0)
    yield dut.d_in.byte_sel.eq(0)
    yield
    while not (yield dut.d_out.valid):
        yield
    data = yield dut.d_out.data
    return data


def dcache_store(dut, addr, data, nc=0):
    yield dut.d_in.load.eq(0)
    yield dut.d_in.nc.eq(nc)
    yield dut.d_in.data.eq(data)
    yield dut.d_in.byte_sel.eq(~0)
    yield dut.d_in.addr.eq(addr)
    yield dut.d_in.valid.eq(1)
    yield
    yield dut.d_in.valid.eq(0)
    yield dut.d_in.byte_sel.eq(0)
    yield
    while not (yield dut.d_out.valid):
        yield


def dcache_random_sim(dut):

    # start with stack of zeros
    sim_mem = [0] * 512

    # clear stuff
    yield dut.d_in.valid.eq(0)
    yield dut.d_in.load.eq(0)
    yield dut.d_in.priv_mode.eq(1)
    yield dut.d_in.nc.eq(0)
    yield dut.d_in.addr.eq(0)
    yield dut.d_in.data.eq(0)
    yield dut.m_in.valid.eq(0)
    yield dut.m_in.addr.eq(0)
    yield dut.m_in.pte.eq(0)
    # wait 4 * clk_period
    yield
    yield
    yield
    yield

    print ()

    for i in range(256):
        addr = randint(0, 255)
        data = randint(0, (1 << 64) - 1)
        sim_mem[addr] = data
        addr *= 8

        print ("testing %x data %x" % (addr, data))

        yield from dcache_load(dut, addr)
        yield from dcache_store(dut, addr, data)

        addr = randint(0, 255)
        sim_data = sim_mem[addr]
        addr *= 8

        data = yield from dcache_load(dut, addr)
        assert data == sim_data, \
            "check %x data %x != %x" % (addr, data, sim_data)

    for addr in range(256):
        data = yield from dcache_load(dut, addr * 8)
        assert data == sim_mem[addr], \
            "final check %x data %x != %x" % (addr * 8, data, sim_mem[addr])


def dcache_sim(dut):
    # clear stuff
    yield dut.d_in.valid.eq(0)
    yield dut.d_in.load.eq(0)
    yield dut.d_in.priv_mode.eq(1)
    yield dut.d_in.nc.eq(0)
    yield dut.d_in.addr.eq(0)
    yield dut.d_in.data.eq(0)
    yield dut.m_in.valid.eq(0)
    yield dut.m_in.addr.eq(0)
    yield dut.m_in.pte.eq(0)
    # wait 4 * clk_period
    yield
    yield
    yield
    yield

    # Cacheable read of address 0x58
    data = yield from dcache_load(dut, 0x58)
    addr = yield dut.d_in.addr
    assert data == 0x0000001700000016, \
        "data @%x=%x expected 0x0000001700000016" % (addr, data)

    # Cacheable read of address 0x20
    data = yield from dcache_load(dut, 0x20)
    addr = yield dut.d_in.addr
    assert data == 0x0000000900000008, \
        "data @%x=%x expected 0x0000000900000008" % (addr, data)

    # Cacheable read of address 0x530
    data = yield from dcache_load(dut, 0x530)
    addr = yield dut.d_in.addr
    assert data == 0x0000014D0000014C, \
        "data @%x=%x expected 0x0000014D0000014C" % (addr, data)

    # 2nd cacheable read of address 0x530
    data = yield from dcache_load(dut, 0x530)
    addr = yield dut.d_in.addr
    assert data == 0x0000014D0000014C, \
        "data @%x=%x expected 0x0000014D0000014C" % (addr, data)

    # Non-cacheable read of address 0x100
    data = yield from dcache_load(dut, 0x100, nc=1)
    addr = yield dut.d_in.addr
    assert data == 0x0000004100000040, \
        "data @%x=%x expected 0x0000004100000040" % (addr, data)

    # Store at address 0x530
    yield from dcache_store(dut, 0x530, 0x121)

    # 2nd store at address 0x530
    yield from dcache_store(dut, 0x530, 0x12345678)

    # 3rd cacheable read of address 0x530
    data = yield from dcache_load(dut, 0x530)
    addr = yield dut.d_in.addr
    assert data == 0x12345678, \
        "data @%x=%x expected 0x12345678" % (addr, data)

    # 4th cacheable read of address 0x20
    data = yield from dcache_load(dut, 0x20)
    addr = yield dut.d_in.addr
    assert data == 0x0000000900000008, \
        "data @%x=%x expected 0x0000000900000008" % (addr, data)

    yield
    yield
    yield
    yield


def test_dcache(mem, test_fn, test_name):
    dut = DCache()

    memory = Memory(width=64, depth=16*64, init=mem)
    sram = SRAM(memory=memory, granularity=8)

    m = Module()
    m.submodules.dcache = dut
    m.submodules.sram = sram

    m.d.comb += sram.bus.cyc.eq(dut.wb_out.cyc)
    m.d.comb += sram.bus.stb.eq(dut.wb_out.stb)
    m.d.comb += sram.bus.we.eq(dut.wb_out.we)
    m.d.comb += sram.bus.sel.eq(dut.wb_out.sel)
    m.d.comb += sram.bus.adr.eq(dut.wb_out.adr)
    m.d.comb += sram.bus.dat_w.eq(dut.wb_out.dat)

    m.d.comb += dut.wb_in.ack.eq(sram.bus.ack)
    m.d.comb += dut.wb_in.dat.eq(sram.bus.dat_r)

    # nmigen Simulation
    sim = Simulator(m)
    sim.add_clock(1e-6)

    sim.add_sync_process(wrap(test_fn(dut)))
    with sim.write_vcd('test_dcache%s.vcd' % test_name):
        sim.run()


if __name__ == '__main__':
    dut = DCache()
    vl = rtlil.convert(dut, ports=[])
    with open("test_dcache.il", "w") as f:
        f.write(vl)

    mem = []
    for i in range(0, 512):
        mem.append((i*2) | ((i*2+1) << 32))

    test_dcache(mem, dcache_sim, "")
    test_dcache(None, dcache_random_sim, "random")