1 """DCache
2
3 based on Anton Blanchard microwatt dcache.vhdl
4
5 note that the microwatt dcache wishbone interface expects "stall".
6 for simplicity at the moment this is hard-coded to cyc & ~ack.
7 see WB4 spec, p84, section 5.2.1
8
9 IMPORTANT: for store, the data is sampled the cycle AFTER the "valid"
10 is raised. sigh
11
12 Links:
13
14 * https://libre-soc.org/3d_gpu/architecture/set_associative_cache.jpg
15 * https://bugs.libre-soc.org/show_bug.cgi?id=469
16
17 """

import sys

from nmutil.gtkw import write_gtkw

sys.setrecursionlimit(1000000)

from enum import Enum, unique

from nmigen import (Module, Signal, Elaboratable, Cat, Repl, Array, Const,
                    Record)
from nmutil.util import Display

from copy import deepcopy
from random import randint, seed

from nmigen_soc.wishbone.bus import Interface

from nmigen.cli import main
from nmutil.iocontrol import RecordObject
from nmigen.utils import log2_int
from soc.experiment.mem_types import (LoadStore1ToDCacheType,
                                      DCacheToLoadStore1Type,
                                      MMUToDCacheType,
                                      DCacheToMMUType)

from soc.experiment.wb_types import (WB_ADDR_BITS, WB_DATA_BITS, WB_SEL_BITS,
                                     WBAddrType, WBDataType, WBSelType,
                                     WBMasterOut, WBSlaveOut,
                                     WBMasterOutVector, WBSlaveOutVector,
                                     WBIOMasterOut, WBIOSlaveOut)

from soc.experiment.cache_ram import CacheRam
#from soc.experiment.plru import PLRU
from nmutil.plru import PLRU

# for test
from soc.bus.sram import SRAM
from nmigen import Memory
from nmigen.cli import rtlil

# NOTE: to use cxxsim, export NMIGEN_SIM_MODE=cxxsim from the shell
# Also, check out the cxxsim nmigen branch, and latest yosys from git
from nmutil.sim_tmp_alternative import Simulator

from nmutil.util import wrap


# TODO: make these parameters of DCache at some point
LINE_SIZE = 64    # Line size in bytes
NUM_LINES = 16    # Number of lines in a set
NUM_WAYS = 4      # Number of ways
TLB_SET_SIZE = 64 # L1 DTLB entries per set
TLB_NUM_WAYS = 2  # L1 DTLB number of sets
TLB_LG_PGSZ = 12  # L1 DTLB log_2(page_size)
LOG_LENGTH = 0    # Non-zero to enable log data collection

# BRAM organisation: We never access more than
#     -- WB_DATA_BITS at a time so to save
#     -- resources we make the array only that wide, and
#     -- use consecutive indices to make a cache "line"
#     --
#     -- ROW_SIZE is the width in bytes of the BRAM
#     -- (based on WB, so 64-bits)
ROW_SIZE = WB_DATA_BITS // 8

# ROW_PER_LINE is the number of row (wishbone
# transactions) in a line
ROW_PER_LINE = LINE_SIZE // ROW_SIZE

# BRAM_ROWS is the number of rows in BRAM needed
# to represent the full dcache
BRAM_ROWS = NUM_LINES * ROW_PER_LINE
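
# Worked example with the default geometry above (derived from the
# constants; the prints below show the same values at import time):
#   ROW_SIZE     = 64 // 8 = 8 bytes
#   ROW_PER_LINE = 64 // 8 = 8 wishbone transactions per line
#   BRAM_ROWS    = 16 * 8  = 128 rows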

print ("ROW_SIZE", ROW_SIZE)
print ("ROW_PER_LINE", ROW_PER_LINE)
print ("BRAM_ROWS", BRAM_ROWS)
print ("NUM_WAYS", NUM_WAYS)

# Bit fields counts in the address

# REAL_ADDR_BITS is the number of real address
# bits that we store
REAL_ADDR_BITS = 56

# ROW_BITS is the number of bits to select a row
ROW_BITS = log2_int(BRAM_ROWS)

# ROW_LINE_BITS is the number of bits to select
# a row within a line
ROW_LINE_BITS = log2_int(ROW_PER_LINE)

# LINE_OFF_BITS is the number of bits for
# the offset in a cache line
LINE_OFF_BITS = log2_int(LINE_SIZE)

# ROW_OFF_BITS is the number of bits for
# the offset in a row
ROW_OFF_BITS = log2_int(ROW_SIZE)

# INDEX_BITS is the number of bits to
# select a cache line
INDEX_BITS = log2_int(NUM_LINES)

# SET_SIZE_BITS is the log base 2 of the set size
SET_SIZE_BITS = LINE_OFF_BITS + INDEX_BITS

# TAG_BITS is the number of bits of
# the tag part of the address
TAG_BITS = REAL_ADDR_BITS - SET_SIZE_BITS

# TAG_WIDTH is the width in bits of each way of the tag RAM
TAG_WIDTH = TAG_BITS + 7 - ((TAG_BITS + 7) % 8)

# WAY_BITS is the number of bits to select a way
WAY_BITS = log2_int(NUM_WAYS)
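
# With the default geometry these work out as follows (derived values;
# TAG_WIDTH simply rounds TAG_BITS up to a whole number of bytes):
#   ROW_BITS = 7, ROW_LINE_BITS = 3, LINE_OFF_BITS = 6, ROW_OFF_BITS = 3,
#   INDEX_BITS = 4, SET_SIZE_BITS = 10, TAG_BITS = 46, TAG_WIDTH = 48,
#   WAY_BITS = 2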

# Example of layout for 32 lines of 64 bytes:
layout = """\
  ..  tag    |index| line  |
  ..         |   row   |   |
  ..         |     |---|   | ROW_LINE_BITS  (3)
  ..         |     |--- - --| LINE_OFF_BITS (6)
  ..         |         |- --| ROW_OFF_BITS  (3)
  ..         |----- ---|    | ROW_BITS      (8)
  ..         |-----|        | INDEX_BITS    (5)
  .. --------|              | TAG_BITS      (45)
"""
print (layout)
print ("Dcache TAG %d IDX %d ROW_BITS %d ROFF %d LOFF %d RLB %d" % \
            (TAG_BITS, INDEX_BITS, ROW_BITS,
             ROW_OFF_BITS, LINE_OFF_BITS, ROW_LINE_BITS))
print ("index @: %d-%d" % (LINE_OFF_BITS, SET_SIZE_BITS))
print ("row @: %d-%d" % (ROW_OFF_BITS, SET_SIZE_BITS))
print ("tag @: %d-%d width %d" % (SET_SIZE_BITS, REAL_ADDR_BITS, TAG_WIDTH))

TAG_RAM_WIDTH = TAG_WIDTH * NUM_WAYS

print ("TAG_RAM_WIDTH", TAG_RAM_WIDTH)

def CacheTagArray():
    tag_layout = [('valid', NUM_WAYS), # one valid bit per way
                  ('tag', TAG_RAM_WIDTH),
                 ]
    return Array(Record(tag_layout, name="tag%d" % x) for x in range(NUM_LINES))

def RowPerLineValidArray():
    return Array(Signal(name="rows_valid%d" % x) \
                        for x in range(ROW_PER_LINE))

# L1 TLB
TLB_SET_BITS = log2_int(TLB_SET_SIZE)
TLB_WAY_BITS = log2_int(TLB_NUM_WAYS)
TLB_EA_TAG_BITS = 64 - (TLB_LG_PGSZ + TLB_SET_BITS)
TLB_TAG_WAY_BITS = TLB_NUM_WAYS * TLB_EA_TAG_BITS
TLB_PTE_BITS = 64
TLB_PTE_WAY_BITS = TLB_NUM_WAYS * TLB_PTE_BITS
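
# Derived TLB geometry with the defaults above:
#   TLB_SET_BITS = 6, TLB_WAY_BITS = 1,
#   TLB_EA_TAG_BITS  = 64 - (12 + 6) = 46,
#   TLB_TAG_WAY_BITS = 2 * 46 = 92, TLB_PTE_WAY_BITS = 2 * 64 = 128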

def ispow2(x):
    return (1<<log2_int(x, False)) == x

assert (LINE_SIZE % ROW_SIZE) == 0, "LINE_SIZE not multiple of ROW_SIZE"
assert ispow2(LINE_SIZE), "LINE_SIZE not power of 2"
assert ispow2(NUM_LINES), "NUM_LINES not power of 2"
assert ispow2(ROW_PER_LINE), "ROW_PER_LINE not power of 2"
assert ROW_BITS == (INDEX_BITS + ROW_LINE_BITS), "geometry bits don't add up"
assert (LINE_OFF_BITS == ROW_OFF_BITS + ROW_LINE_BITS), \
        "geometry bits don't add up"
assert REAL_ADDR_BITS == (TAG_BITS + INDEX_BITS + LINE_OFF_BITS), \
        "geometry bits don't add up"
assert REAL_ADDR_BITS == (TAG_BITS + ROW_BITS + ROW_OFF_BITS), \
        "geometry bits don't add up"
assert 64 == WB_DATA_BITS, "Can't yet handle wb width that isn't 64-bits"
assert SET_SIZE_BITS <= TLB_LG_PGSZ, "Set indexed by virtual address"

def TLBHit(name):
    return Record([('valid', 1),
                   ('way', TLB_WAY_BITS)], name=name)

def TLBTagEAArray():
    return Array(Signal(TLB_EA_TAG_BITS, name="tlbtagea%d" % x) \
                for x in range (TLB_NUM_WAYS))

def TLBRecord(name):
    tlb_layout = [('valid', TLB_NUM_WAYS),
                  ('tag', TLB_TAG_WAY_BITS),
                  ('pte', TLB_PTE_WAY_BITS)
                 ]
    return Record(tlb_layout, name=name)

def TLBArray():
    return Array(TLBRecord(name="tlb%d" % x) for x in range(TLB_SET_SIZE))

def HitWaySet():
    return Array(Signal(WAY_BITS, name="hitway_%d" % x) \
                        for x in range(TLB_NUM_WAYS))

# Cache RAM interface
def CacheRamOut():
    return Array(Signal(WB_DATA_BITS, name="cache_out%d" % x) \
                 for x in range(NUM_WAYS))

# PLRU output interface
def PLRUOut():
    return Array(Signal(WAY_BITS, name="plru_out%d" % x) \
                for x in range(NUM_LINES))

# TLB PLRU output interface
def TLBPLRUOut():
    return Array(Signal(TLB_WAY_BITS, name="tlbplru_out%d" % x) \
                for x in range(TLB_SET_SIZE))

# Helper functions to decode incoming requests
#
# Return the cache line index (tag index) for an address
def get_index(addr):
    return addr[LINE_OFF_BITS:SET_SIZE_BITS]

# Return the cache row index (data memory) for an address
def get_row(addr):
    return addr[ROW_OFF_BITS:SET_SIZE_BITS]

# Return the index of a row within a line
def get_row_of_line(row):
    return row[:ROW_BITS][:ROW_LINE_BITS]

# Returns whether this is the last row of a line
def is_last_row_addr(addr, last):
    return addr[ROW_OFF_BITS:LINE_OFF_BITS] == last

# Returns whether this is the last row of a line
def is_last_row(row, last):
    return get_row_of_line(row) == last

# Return the next row in the current cache line. We use a
# dedicated function in order to limit the size of the
# generated adder to be only the bits within a cache line
# (3 bits with default settings)
def next_row(row):
    row_v = row[0:ROW_LINE_BITS] + 1
    return Cat(row_v[:ROW_LINE_BITS], row[ROW_LINE_BITS:])
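
# Worked example (ROW_LINE_BITS == 3 with the default geometry): only the
# low 3 bits increment, and they wrap rather than carry upwards, so the
# row index can never stray outside its cache line:
#   next_row(0b0101_110) -> 0b0101_111
#   next_row(0b0101_111) -> 0b0101_000   (wraps within the same line)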

# Get the tag value from the address
def get_tag(addr):
    return addr[SET_SIZE_BITS:REAL_ADDR_BITS]

# Read a tag from a tag memory row
def read_tag(way, tagset):
    return tagset.word_select(way, TAG_WIDTH)[:TAG_BITS]

# Read a TLB tag from a TLB tag memory row
def read_tlb_tag(way, tags):
    return tags.word_select(way, TLB_EA_TAG_BITS)

# Write a TLB tag to a TLB tag memory row
def write_tlb_tag(way, tags, tag):
    return read_tlb_tag(way, tags).eq(tag)

# Read a PTE from a TLB PTE memory row
def read_tlb_pte(way, ptes):
    return ptes.word_select(way, TLB_PTE_BITS)

def write_tlb_pte(way, ptes, newpte):
    return read_tlb_pte(way, ptes).eq(newpte)
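
# For orientation: word_select(way, width) picks the slice
# [way*width : (way+1)*width], so e.g. read_tlb_pte(1, ptes) with
# TLB_PTE_BITS == 64 selects ptes[64:128]; the write_* helpers return
# an assignment to that same slice.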


# Record for storing permission, attribute, etc. bits from a PTE
class PermAttr(RecordObject):
    def __init__(self, name=None):
        super().__init__(name=name)
        self.reference = Signal()
        self.changed = Signal()
        self.nocache = Signal()
        self.priv = Signal()
        self.rd_perm = Signal()
        self.wr_perm = Signal()


def extract_perm_attr(pte):
    pa = PermAttr()
    return pa


# Type of operation on a "valid" input
@unique
class Op(Enum):
    OP_NONE = 0
    OP_BAD = 1           # NC cache hit, TLB miss, prot/RC failure
    OP_STCX_FAIL = 2     # conditional store w/o reservation
    OP_LOAD_HIT = 3      # Cache hit on load
    OP_LOAD_MISS = 4     # Load missing cache
    OP_LOAD_NC = 5       # Non-cachable load
    OP_STORE_HIT = 6     # Store hitting cache
    OP_STORE_MISS = 7    # Store missing cache


# Cache state machine
@unique
class State(Enum):
    IDLE = 0             # Normal load hit processing
    RELOAD_WAIT_ACK = 1  # Cache reload wait ack
    STORE_WAIT_ACK = 2   # Store wait ack
    NC_LOAD_WAIT_ACK = 3 # Non-cachable load wait ack


# Dcache operations:
#
# In order to make timing, we use the BRAMs with
# an output buffer, which means that the BRAM
# output is delayed by an extra cycle.
#
# Thus, the dcache has a 2-stage internal pipeline
# for cache hits with no stalls.
#
# All other operations are handled via stalling
# in the first stage.
#
# The second stage can thus complete a hit at the same
# time as the first stage emits a stall for a complex op.
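#
# A rough load-hit timeline implied by the above (a sketch, not
# cycle-accurate documentation): cycle 0 latches the request into r0;
# cycle 1 performs the TLB and tag compare and issues the BRAM read;
# cycle 2 has the buffered BRAM output available for writeback.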
#
# Stage 0 register, basically contains just the latched request

class RegStage0(RecordObject):
    def __init__(self, name=None):
        super().__init__(name=name)
        self.req = LoadStore1ToDCacheType(name="lsmem")
        self.tlbie = Signal()   # indicates a tlbie request (from MMU)
        self.doall = Signal()   # with tlbie, indicates flush whole TLB
        self.tlbld = Signal()   # indicates a TLB load request (from MMU)
        self.mmu_req = Signal() # indicates source of request
        self.d_valid = Signal() # indicates req.data is valid now


class MemAccessRequest(RecordObject):
    def __init__(self, name=None):
        super().__init__(name=name)
        self.op = Signal(Op)
        self.valid = Signal()
        self.dcbz = Signal()
        self.real_addr = Signal(REAL_ADDR_BITS)
        self.data = Signal(64)
        self.byte_sel = Signal(8)
        self.hit_way = Signal(WAY_BITS)
        self.same_tag = Signal()
        self.mmu_req = Signal()


# First stage register, contains state for stage 1 of load hits
# and for the state machine used by all other operations
class RegStage1(RecordObject):
    def __init__(self, name=None):
        super().__init__(name=name)
        # Info about the request
        self.full = Signal()            # have uncompleted request
        self.mmu_req = Signal()         # request is from MMU
        self.req = MemAccessRequest(name="reqmem")

        # Cache hit state
        self.hit_way = Signal(WAY_BITS)
        self.hit_load_valid = Signal()
        self.hit_index = Signal(INDEX_BITS)
        self.cache_hit = Signal()

        # TLB hit state
        self.tlb_hit = TLBHit("tlb_hit")
        self.tlb_hit_index = Signal(TLB_SET_BITS)

        # 2-stage data buffer for data forwarded from writes to reads
        self.forward_data1 = Signal(64)
        self.forward_data2 = Signal(64)
        self.forward_sel1 = Signal(8)
        self.forward_valid1 = Signal()
        self.forward_way1 = Signal(WAY_BITS)
        self.forward_row1 = Signal(ROW_BITS)
        self.use_forward1 = Signal()
        self.forward_sel = Signal(8)

        # Cache miss state (reload state machine)
        self.state = Signal(State)
        self.dcbz = Signal()
        self.write_bram = Signal()
        self.write_tag = Signal()
        self.slow_valid = Signal()
        self.wb = WBMasterOut("wb")
        self.reload_tag = Signal(TAG_BITS)
        self.store_way = Signal(WAY_BITS)
        self.store_row = Signal(ROW_BITS)
        self.store_index = Signal(INDEX_BITS)
        self.end_row_ix = Signal(ROW_LINE_BITS)
        self.rows_valid = RowPerLineValidArray()
        self.acks_pending = Signal(3)
        self.inc_acks = Signal()
        self.dec_acks = Signal()

        # Signals to complete (possibly with error)
        self.ls_valid = Signal()
        self.ls_error = Signal()
        self.mmu_done = Signal()
        self.mmu_error = Signal()
        self.cache_paradox = Signal()

        # Signal to complete a failed stcx.
        self.stcx_fail = Signal()


# Reservation information
class Reservation(RecordObject):
    def __init__(self):
        super().__init__()
        self.valid = Signal()
        self.addr = Signal(64-LINE_OFF_BITS)


class DTLBUpdate(Elaboratable):
    def __init__(self):
        self.tlbie = Signal()
        self.tlbwe = Signal()
        self.doall = Signal()
        self.updated = Signal()
        self.v_updated = Signal()
        self.tlb_hit = TLBHit("tlb_hit")
        self.tlb_req_index = Signal(TLB_SET_BITS)

        self.tlb_tag_way = Signal(TLB_TAG_WAY_BITS)
        self.tlb_pte_way = Signal(TLB_PTE_WAY_BITS)
        self.repl_way = Signal(TLB_WAY_BITS)
        self.eatag = Signal(TLB_EA_TAG_BITS)
        self.pte_data = Signal(TLB_PTE_BITS)

        self.dv = Signal(TLB_NUM_WAYS) # tlb_way_valids_t

        self.tb_out = Signal(TLB_TAG_WAY_BITS) # tlb_way_tags_t
        self.db_out = Signal(TLB_NUM_WAYS)     # tlb_way_valids_t
        self.pb_out = Signal(TLB_PTE_WAY_BITS) # tlb_way_ptes_t

    def elaborate(self, platform):
        m = Module()
        comb = m.d.comb
        sync = m.d.sync

        tagset = Signal(TLB_TAG_WAY_BITS)
        pteset = Signal(TLB_PTE_WAY_BITS)

        tb_out, pb_out, db_out = self.tb_out, self.pb_out, self.db_out
        comb += db_out.eq(self.dv)

        with m.If(self.tlbie & self.doall):
            pass # clear all back in parent
        with m.Elif(self.tlbie):
            with m.If(self.tlb_hit.valid):
                comb += db_out.bit_select(self.tlb_hit.way, 1).eq(0)
                comb += self.v_updated.eq(1)

        with m.Elif(self.tlbwe):

            comb += tagset.eq(self.tlb_tag_way)
            comb += write_tlb_tag(self.repl_way, tagset, self.eatag)
            comb += tb_out.eq(tagset)

            comb += pteset.eq(self.tlb_pte_way)
            comb += write_tlb_pte(self.repl_way, pteset, self.pte_data)
            comb += pb_out.eq(pteset)

            comb += db_out.bit_select(self.repl_way, 1).eq(1)

            comb += self.updated.eq(1)
            comb += self.v_updated.eq(1)

        return m


class DCachePendingHit(Elaboratable):

    def __init__(self, tlb_way,
                 cache_i_validdx, cache_tag_set,
                 req_addr,
                 hit_set):

        self.go = Signal()
        self.virt_mode = Signal()
        self.is_hit = Signal()
        self.tlb_hit = TLBHit("tlb_hit")
        self.hit_way = Signal(WAY_BITS)
        self.rel_match = Signal()
        self.req_index = Signal(INDEX_BITS)
        self.reload_tag = Signal(TAG_BITS)

        self.tlb_way = tlb_way
        self.cache_i_validdx = cache_i_validdx
        self.cache_tag_set = cache_tag_set
        self.req_addr = req_addr
        self.hit_set = hit_set

    def elaborate(self, platform):
        m = Module()
        comb = m.d.comb
        sync = m.d.sync

        go = self.go
        virt_mode = self.virt_mode
        is_hit = self.is_hit
        tlb_way = self.tlb_way
        cache_i_validdx = self.cache_i_validdx
        cache_tag_set = self.cache_tag_set
        req_addr = self.req_addr
        tlb_hit = self.tlb_hit
        hit_set = self.hit_set
        hit_way = self.hit_way
        rel_match = self.rel_match
        req_index = self.req_index
        reload_tag = self.reload_tag

        rel_matches = Array(Signal(name="rel_matches_%d" % i) \
                                    for i in range(TLB_NUM_WAYS))
        hit_way_set = HitWaySet()

        # Test if pending request is a hit on any way
        # In order to make timing in virtual mode,
        # when we are using the TLB, we compare each
        # way with each of the real addresses from each way of
        # the TLB, and then decide later which match to use.

        with m.If(virt_mode):
            for j in range(TLB_NUM_WAYS): # tlb_num_way_t
                s_tag = Signal(TAG_BITS, name="s_tag%d" % j)
                s_hit = Signal()
                s_pte = Signal(TLB_PTE_BITS)
                s_ra = Signal(REAL_ADDR_BITS)
                comb += s_pte.eq(read_tlb_pte(j, tlb_way.pte))
                comb += s_ra.eq(Cat(req_addr[0:TLB_LG_PGSZ],
                                    s_pte[TLB_LG_PGSZ:REAL_ADDR_BITS]))
                comb += s_tag.eq(get_tag(s_ra))

                for i in range(NUM_WAYS): # way_t
                    is_tag_hit = Signal(name="is_tag_hit_%d_%d" % (j, i))
                    comb += is_tag_hit.eq(go & cache_i_validdx[i] &
                                  (read_tag(i, cache_tag_set) == s_tag)
                                  & (tlb_way.valid[j]))
                    with m.If(is_tag_hit):
                        comb += hit_way_set[j].eq(i)
                        comb += s_hit.eq(1)
                comb += hit_set[j].eq(s_hit)
                with m.If(s_tag == reload_tag):
                    comb += rel_matches[j].eq(1)
            with m.If(tlb_hit.valid):
                comb += is_hit.eq(hit_set[tlb_hit.way])
                comb += hit_way.eq(hit_way_set[tlb_hit.way])
                comb += rel_match.eq(rel_matches[tlb_hit.way])
        with m.Else():
            s_tag = Signal(TAG_BITS)
            comb += s_tag.eq(get_tag(req_addr))
            for i in range(NUM_WAYS): # way_t
                is_tag_hit = Signal(name="is_tag_hit_%d" % i)
                comb += is_tag_hit.eq(go & cache_i_validdx[i] &
                          (read_tag(i, cache_tag_set) == s_tag))
                with m.If(is_tag_hit):
                    comb += hit_way.eq(i)
                    comb += is_hit.eq(1)
            with m.If(s_tag == reload_tag):
                comb += rel_match.eq(1)

        return m


class DCache(Elaboratable):
    """Set associative dcache write-through

    TODO (in no specific order):
    * See list in icache.vhdl
    * Complete load misses on the cycle when WB data comes instead of
      at the end of line (this requires dealing with requests coming in
      while not idle...)
    """
    def __init__(self):
        self.d_in = LoadStore1ToDCacheType("d_in")
        self.d_out = DCacheToLoadStore1Type("d_out")

        self.m_in = MMUToDCacheType("m_in")
        self.m_out = DCacheToMMUType("m_out")

        self.stall_out = Signal()

        # standard naming (wired to non-standard for compatibility)
        self.bus = Interface(addr_width=32,
                             data_width=64,
                             granularity=8,
                             features={'stall'},
                             alignment=0,
                             name="dcache")

        self.log_out = Signal(20)

    def stage_0(self, m, r0, r1, r0_full):
        """Latch the request in r0.req as long as we're not stalling
        """
        comb = m.d.comb
        sync = m.d.sync
        d_in, d_out, m_in = self.d_in, self.d_out, self.m_in

        r = RegStage0("stage0")

        # TODO, this goes in unit tests and formal proofs
        with m.If(d_in.valid & m_in.valid):
            sync += Display("request collision loadstore vs MMU")

        with m.If(m_in.valid):
            comb += r.req.valid.eq(1)
            comb += r.req.load.eq(~(m_in.tlbie | m_in.tlbld)) # no invalidate
            comb += r.req.dcbz.eq(0)
            comb += r.req.nc.eq(0)
            comb += r.req.reserve.eq(0)
            comb += r.req.virt_mode.eq(0)
            comb += r.req.priv_mode.eq(1)
            comb += r.req.addr.eq(m_in.addr)
            comb += r.req.data.eq(m_in.pte)
            comb += r.req.byte_sel.eq(~0) # Const -1 sets all to 0b111....
            comb += r.tlbie.eq(m_in.tlbie)
            comb += r.doall.eq(m_in.doall)
            comb += r.tlbld.eq(m_in.tlbld)
            comb += r.mmu_req.eq(1)
            m.d.sync += Display(" DCACHE req mmu addr %x pte %x ld %d",
                                m_in.addr, m_in.pte, r.req.load)

        with m.Else():
            comb += r.req.eq(d_in)
            comb += r.req.data.eq(0)
            comb += r.tlbie.eq(0)
            comb += r.doall.eq(0)
            comb += r.tlbld.eq(0)
            comb += r.mmu_req.eq(0)
        with m.If((~r1.full & ~d_in.hold) | ~r0_full):
            sync += r0.eq(r)
            sync += r0_full.eq(r.req.valid)
        # Sample data the cycle after a request comes in from loadstore1.
        # If another request has come in already then the data will get
        # put directly into req.data below.
        with m.If(r0.req.valid & ~r.req.valid & ~r0.d_valid &
                  ~r0.mmu_req):
            sync += r0.req.data.eq(d_in.data)
            sync += r0.d_valid.eq(1)
        with m.If(d_in.valid):
            m.d.sync += Display(" DCACHE req cache "
                                "virt %d addr %x data %x ld %d",
                                r.req.virt_mode, r.req.addr,
                                r.req.data, r.req.load)

    def tlb_read(self, m, r0_stall, tlb_way, dtlb):
        """TLB
        Operates in the second cycle on the request latched in r0.req.
        TLB updates write the entry at the end of the second cycle.
        """
        comb = m.d.comb
        sync = m.d.sync
        m_in, d_in = self.m_in, self.d_in

        index = Signal(TLB_SET_BITS)
        addrbits = Signal(TLB_SET_BITS)

        amin = TLB_LG_PGSZ
        amax = TLB_LG_PGSZ + TLB_SET_BITS

        with m.If(m_in.valid):
            comb += addrbits.eq(m_in.addr[amin : amax])
        with m.Else():
            comb += addrbits.eq(d_in.addr[amin : amax])
        comb += index.eq(addrbits)

        # If we have any op and the previous op isn't finished,
        # then keep the same output for next cycle.
        with m.If(~r0_stall):
            sync += tlb_way.eq(dtlb[index])

    def maybe_tlb_plrus(self, m, r1, tlb_plru_victim):
        """Generate TLB PLRUs
        """
        comb = m.d.comb
        sync = m.d.sync

        if TLB_NUM_WAYS == 0:
            return

        # XXX TODO: use a Binary-to-Unary Encoder here
        tlb_hit_onehot = Signal(TLB_SET_SIZE)
        with m.If(r1.tlb_hit.valid):
            comb += tlb_hit_onehot.eq(1<<r1.tlb_hit_index)

        for i in range(TLB_SET_SIZE):
            # TLB PLRU interface
            tlb_plru = PLRU(TLB_WAY_BITS)
            setattr(m.submodules, "maybe_plru_%d" % i, tlb_plru)
            tlb_plru_acc_en = Signal()

            comb += tlb_plru_acc_en.eq(tlb_hit_onehot[i])
            comb += tlb_plru.acc_en.eq(tlb_plru_acc_en)
            comb += tlb_plru.acc_i.eq(r1.tlb_hit.way)
            comb += tlb_plru_victim[i].eq(tlb_plru.lru_o)

    def tlb_search(self, m, tlb_req_index, r0, r0_valid,
                   tlb_way,
                   pte, tlb_hit, valid_ra, perm_attr, ra):

        comb = m.d.comb

        hitway = Signal(TLB_WAY_BITS)
        hit = Signal()
        eatag = Signal(TLB_EA_TAG_BITS)

        TLB_LG_END = TLB_LG_PGSZ + TLB_SET_BITS
        comb += tlb_req_index.eq(r0.req.addr[TLB_LG_PGSZ : TLB_LG_END])
        comb += eatag.eq(r0.req.addr[TLB_LG_END : 64 ])

        for i in range(TLB_NUM_WAYS):
            is_tag_hit = Signal(name="is_tag_hit%d" % i)
            tlb_tag = Signal(TLB_EA_TAG_BITS, name="tlb_tag%d" % i)
            comb += tlb_tag.eq(read_tlb_tag(i, tlb_way.tag))
            comb += is_tag_hit.eq((tlb_way.valid[i]) & (tlb_tag == eatag))
            with m.If(is_tag_hit):
                comb += hitway.eq(i)
                comb += hit.eq(1)

        comb += tlb_hit.valid.eq(hit & r0_valid)
        comb += tlb_hit.way.eq(hitway)

        with m.If(tlb_hit.valid):
            comb += pte.eq(read_tlb_pte(hitway, tlb_way.pte))
        comb += valid_ra.eq(tlb_hit.valid | ~r0.req.virt_mode)

        with m.If(r0.req.virt_mode):
            comb += ra.eq(Cat(Const(0, ROW_OFF_BITS),
                              r0.req.addr[ROW_OFF_BITS:TLB_LG_PGSZ],
                              pte[TLB_LG_PGSZ:REAL_ADDR_BITS]))
            comb += perm_attr.reference.eq(pte[8])
            comb += perm_attr.changed.eq(pte[7])
            comb += perm_attr.nocache.eq(pte[5])
            comb += perm_attr.priv.eq(pte[3])
            comb += perm_attr.rd_perm.eq(pte[2])
            comb += perm_attr.wr_perm.eq(pte[1])
        with m.Else():
            comb += ra.eq(Cat(Const(0, ROW_OFF_BITS),
                              r0.req.addr[ROW_OFF_BITS:REAL_ADDR_BITS]))
            comb += perm_attr.reference.eq(1)
            comb += perm_attr.changed.eq(1)
            comb += perm_attr.nocache.eq(0)
            comb += perm_attr.priv.eq(1)
            comb += perm_attr.rd_perm.eq(1)
            comb += perm_attr.wr_perm.eq(1)

        with m.If(valid_ra):
            m.d.sync += Display("DCACHE virt mode %d hit %d ra %x pte %x",
                                r0.req.virt_mode, tlb_hit.valid, ra, pte)
            m.d.sync += Display(" perm ref=%d", perm_attr.reference)
            m.d.sync += Display(" perm chg=%d", perm_attr.changed)
            m.d.sync += Display(" perm noc=%d", perm_attr.nocache)
            m.d.sync += Display(" perm prv=%d", perm_attr.priv)
            m.d.sync += Display(" perm rdp=%d", perm_attr.rd_perm)
            m.d.sync += Display(" perm wrp=%d", perm_attr.wr_perm)

    def tlb_update(self, m, r0_valid, r0, dtlb, tlb_req_index,
                   tlb_hit, tlb_plru_victim, tlb_way):

        comb = m.d.comb
        sync = m.d.sync

        tlbie = Signal()
        tlbwe = Signal()

        comb += tlbie.eq(r0_valid & r0.tlbie)
        comb += tlbwe.eq(r0_valid & r0.tlbld)

        m.submodules.tlb_update = d = DTLBUpdate()
        with m.If(tlbie & r0.doall):
            # clear all valid bits at once
            for i in range(TLB_SET_SIZE):
                sync += dtlb[i].valid.eq(0)
        with m.If(d.updated):
            sync += dtlb[tlb_req_index].tag.eq(d.tb_out)
            sync += dtlb[tlb_req_index].pte.eq(d.pb_out)
        with m.If(d.v_updated):
            sync += dtlb[tlb_req_index].valid.eq(d.db_out)

        comb += d.dv.eq(dtlb[tlb_req_index].valid)

        comb += d.tlbie.eq(tlbie)
        comb += d.tlbwe.eq(tlbwe)
        comb += d.doall.eq(r0.doall)
        comb += d.tlb_hit.eq(tlb_hit)
        comb += d.tlb_tag_way.eq(tlb_way.tag)
        comb += d.tlb_pte_way.eq(tlb_way.pte)
        comb += d.tlb_req_index.eq(tlb_req_index)

        with m.If(tlb_hit.valid):
            comb += d.repl_way.eq(tlb_hit.way)
        with m.Else():
            comb += d.repl_way.eq(tlb_plru_victim[tlb_req_index])
        comb += d.eatag.eq(r0.req.addr[TLB_LG_PGSZ + TLB_SET_BITS:64])
        comb += d.pte_data.eq(r0.req.data)

    def maybe_plrus(self, m, r1, plru_victim):
        """Generate PLRUs
        """
        comb = m.d.comb
        sync = m.d.sync

        if NUM_WAYS == 0:
            return

        # XXX TODO: use a Binary-to-Unary Encoder here
        hit_onehot = Signal(NUM_LINES)
        with m.If(r1.cache_hit):
            comb += hit_onehot.eq(1<<r1.hit_index)

        for i in range(NUM_LINES):
            # PLRU interface
            plru = PLRU(WAY_BITS)
            setattr(m.submodules, "plru%d" % i, plru)
            plru_acc_en = Signal()

            comb += plru_acc_en.eq(hit_onehot[i])
            comb += plru.acc_en.eq(plru_acc_en)
            comb += plru.acc_i.eq(r1.hit_way)
            comb += plru_victim[i].eq(plru.lru_o)

    def cache_tag_read(self, m, r0_stall, req_index, cache_tag_set, cache_tags):
        """Cache tag RAM read port
        """
        comb = m.d.comb
        sync = m.d.sync
        m_in, d_in = self.m_in, self.d_in

        index = Signal(INDEX_BITS)

        with m.If(r0_stall):
            comb += index.eq(req_index)
        with m.Elif(m_in.valid):
            comb += index.eq(get_index(m_in.addr))
        with m.Else():
            comb += index.eq(get_index(d_in.addr))
        sync += cache_tag_set.eq(cache_tags[index].tag)

    def dcache_request(self, m, r0, ra, req_index, req_row, req_tag,
                       r0_valid, r1, cache_tags, replace_way,
                       use_forward1_next, use_forward2_next,
                       req_hit_way, plru_victim, rc_ok, perm_attr,
                       valid_ra, perm_ok, access_ok, req_op, req_go,
                       tlb_hit, tlb_way, cache_tag_set,
                       cancel_store, req_same_tag, r0_stall, early_req_row):
        """Cache request parsing and hit detection
        """

        comb = m.d.comb
        m_in, d_in = self.m_in, self.d_in

        is_hit = Signal()
        hit_way = Signal(WAY_BITS)
        op = Signal(Op)
        opsel = Signal(3)
        go = Signal()
        nc = Signal()
        hit_set = Array(Signal(name="hit_set_%d" % i) \
                                  for i in range(TLB_NUM_WAYS))
        cache_i_validdx = Signal(NUM_WAYS)

        # Extract line, row and tag from request
        comb += req_index.eq(get_index(r0.req.addr))
        comb += req_row.eq(get_row(r0.req.addr))
        comb += req_tag.eq(get_tag(ra))

        if False: # display on comb is a bit... busy.
            comb += Display("dcache_req addr:%x ra: %x idx: %x tag: %x row: %x",
                            r0.req.addr, ra, req_index, req_tag, req_row)

        comb += go.eq(r0_valid & ~(r0.tlbie | r0.tlbld) & ~r1.ls_error)
        comb += cache_i_validdx.eq(cache_tags[req_index].valid)

        m.submodules.dcache_pend = dc = DCachePendingHit(tlb_way,
                                            cache_i_validdx, cache_tag_set,
                                            r0.req.addr,
                                            hit_set)
        comb += dc.tlb_hit.eq(tlb_hit)
        comb += dc.reload_tag.eq(r1.reload_tag)
        comb += dc.virt_mode.eq(r0.req.virt_mode)
        comb += dc.go.eq(go)
        comb += dc.req_index.eq(req_index)

        comb += is_hit.eq(dc.is_hit)
        comb += hit_way.eq(dc.hit_way)
        comb += req_same_tag.eq(dc.rel_match)

        # See if the request matches the line currently being reloaded
        with m.If((r1.state == State.RELOAD_WAIT_ACK) &
                  (req_index == r1.store_index) & req_same_tag):
            # For a store, consider this a hit even if the row isn't
            # valid since it will be by the time we perform the store.
            # For a load, check the appropriate row valid bit.
            rrow = Signal(ROW_LINE_BITS)
            comb += rrow.eq(req_row)
            valid = r1.rows_valid[rrow]
            comb += is_hit.eq((~r0.req.load) | valid)
            comb += hit_way.eq(replace_way)

        # Whether to use forwarded data for a load or not
        with m.If((get_row(r1.req.real_addr) == req_row) &
                  (r1.req.hit_way == hit_way)):
            # Only need to consider r1.write_bram here, since if we
            # are writing refill data here, then we don't have a
            # cache hit this cycle on the line being refilled.
            # (There is the possibility that the load following the
            # load miss that started the refill could be to the old
            # contents of the victim line, since it is a couple of
            # cycles after the refill starts before we see the updated
            # cache tag. In that case we don't use the bypass.)
            comb += use_forward1_next.eq(r1.write_bram)
        with m.If((r1.forward_row1 == req_row) & (r1.forward_way1 == hit_way)):
            comb += use_forward2_next.eq(r1.forward_valid1)

        # The way that matched on a hit
        comb += req_hit_way.eq(hit_way)

        # The way to replace on a miss
        with m.If(r1.write_tag):
            comb += replace_way.eq(plru_victim[r1.store_index])
        with m.Else():
            comb += replace_way.eq(r1.store_way)

        # work out whether we have permission for this access
        # NB we don't yet implement AMR, thus no KUAP
        comb += rc_ok.eq(perm_attr.reference
                         & (r0.req.load | perm_attr.changed))
        comb += perm_ok.eq((r0.req.priv_mode | (~perm_attr.priv)) &
                           (perm_attr.wr_perm |
                            (r0.req.load & perm_attr.rd_perm)))
        comb += access_ok.eq(valid_ra & perm_ok & rc_ok)

        # Combine the request and cache hit status to decide what
        # operation needs to be done
        comb += nc.eq(r0.req.nc | perm_attr.nocache)
        comb += op.eq(Op.OP_NONE)
        with m.If(go):
            with m.If(~access_ok):
                m.d.sync += Display("DCACHE access fail valid_ra=%d p=%d rc=%d",
                                    valid_ra, perm_ok, rc_ok)
                comb += op.eq(Op.OP_BAD)
            with m.Elif(cancel_store):
                m.d.sync += Display("DCACHE cancel store")
                comb += op.eq(Op.OP_STCX_FAIL)
            with m.Else():
                m.d.sync += Display("DCACHE valid_ra=%d nc=%d ld=%d",
                                    valid_ra, nc, r0.req.load)
                comb += opsel.eq(Cat(is_hit, nc, r0.req.load))
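                # opsel decode table (Cat packs LSB first: bit 0 is
                # is_hit, bit 1 is nc, bit 2 is load), as used by the
                # Switch below:
                #   101 load hit    100 load miss    110 NC load
                #   001 store hit   000/010 store miss
                #   011/111 bad (an NC cache hit is a paradox)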
                with m.Switch(opsel):
                    with m.Case(0b101): comb += op.eq(Op.OP_LOAD_HIT)
                    with m.Case(0b100): comb += op.eq(Op.OP_LOAD_MISS)
                    with m.Case(0b110): comb += op.eq(Op.OP_LOAD_NC)
                    with m.Case(0b001): comb += op.eq(Op.OP_STORE_HIT)
                    with m.Case(0b000): comb += op.eq(Op.OP_STORE_MISS)
                    with m.Case(0b010): comb += op.eq(Op.OP_STORE_MISS)
                    with m.Case(0b011): comb += op.eq(Op.OP_BAD)
                    with m.Case(0b111): comb += op.eq(Op.OP_BAD)
        comb += req_op.eq(op)
        comb += req_go.eq(go)

        # Version of the row number that is valid one cycle earlier
        # in the cases where we need to read the cache data BRAM.
        # If we're stalling then we need to keep reading the last
        # row requested.
        with m.If(~r0_stall):
            with m.If(m_in.valid):
                comb += early_req_row.eq(get_row(m_in.addr))
            with m.Else():
                comb += early_req_row.eq(get_row(d_in.addr))
        with m.Else():
            comb += early_req_row.eq(req_row)

    def reservation_comb(self, m, cancel_store, set_rsrv, clear_rsrv,
                         r0_valid, r0, reservation):
        """Handle load-with-reservation and store-conditional instructions
        """
        comb = m.d.comb

        with m.If(r0_valid & r0.req.reserve):
            # XXX generate alignment interrupt if address
            # is not aligned XXX or if r0.req.nc = '1'
            with m.If(r0.req.load):
                comb += set_rsrv.eq(r0.req.atomic_last) # load with reservation
            with m.Else():
                comb += clear_rsrv.eq(r0.req.atomic_last) # store conditional
                with m.If((~reservation.valid) |
                          (r0.req.addr[LINE_OFF_BITS:64] != reservation.addr)):
                    comb += cancel_store.eq(1)

    def reservation_reg(self, m, r0_valid, access_ok, set_rsrv, clear_rsrv,
                        reservation, r0):
        comb = m.d.comb
        sync = m.d.sync

        with m.If(r0_valid & access_ok):
            with m.If(clear_rsrv):
                sync += reservation.valid.eq(0)
            with m.Elif(set_rsrv):
                sync += reservation.valid.eq(1)
                sync += reservation.addr.eq(r0.req.addr[LINE_OFF_BITS:64])

    def writeback_control(self, m, r1, cache_out_row):
        """Return data for loads & completion control logic
        """
        comb = m.d.comb
        sync = m.d.sync
        d_out, m_out = self.d_out, self.m_out

        data_out = Signal(64)
        data_fwd = Signal(64)

        # Use the bypass if are reading the row that was
        # written 1 or 2 cycles ago, including for the
        # slow_valid = 1 case (i.e. completing a load
        # miss or a non-cacheable load).
        with m.If(r1.use_forward1):
            comb += data_fwd.eq(r1.forward_data1)
        with m.Else():
            comb += data_fwd.eq(r1.forward_data2)

        comb += data_out.eq(cache_out_row)

        for i in range(8):
            with m.If(r1.forward_sel[i]):
                dsel = data_fwd.word_select(i, 8)
                comb += data_out.word_select(i, 8).eq(dsel)

        # DCache output to LoadStore
        comb += d_out.valid.eq(r1.ls_valid)
        comb += d_out.data.eq(data_out)
        comb += d_out.store_done.eq(~r1.stcx_fail)
        comb += d_out.error.eq(r1.ls_error)
        comb += d_out.cache_paradox.eq(r1.cache_paradox)

        # Outputs to MMU
        comb += m_out.done.eq(r1.mmu_done)
        comb += m_out.err.eq(r1.mmu_error)
        comb += m_out.data.eq(data_out)

        # We have a valid load or store hit or we just completed
        # a slow op such as a load miss, a NC load or a store
        #
        # Note: the load hit is delayed by one cycle. However it
        # can still not collide with r.slow_valid (well unless I
        # miscalculated) because slow_valid can only be set on a
        # subsequent request and not on its first cycle (the state
        # machine must have advanced), which makes slow_valid
        # at least 2 cycles from the previous hit_load_valid.

        # Sanity: Only one of these must be set in any given cycle

        if False: # TODO: need Display to get this to work
            assert (r1.slow_valid & r1.stcx_fail) != 1, \
                "unexpected slow_valid collision with stcx_fail"

            assert ((r1.slow_valid | r1.stcx_fail) | r1.hit_load_valid) != 1, \
                "unexpected hit_load_delayed collision with slow_valid"

        with m.If(~r1.mmu_req):
            # Request came from loadstore1...
            # Load hit case is the standard path
            with m.If(r1.hit_load_valid):
                sync += Display("completing load hit data=%x", data_out)

            # error cases complete without stalling
            with m.If(r1.ls_error):
                with m.If(r1.dcbz):
                    sync += Display("completing dcbz with error")
                with m.Else():
                    sync += Display("completing ld/st with error")

            # Slow ops (load miss, NC, stores)
            with m.If(r1.slow_valid):
                sync += Display("completing store or load miss adr=%x data=%x",
                                r1.req.real_addr, data_out)

        with m.Else():
            # Request came from MMU
            with m.If(r1.hit_load_valid):
                sync += Display("completing load hit to MMU, data=%x",
                                m_out.data)
            # error cases complete without stalling
            with m.If(r1.mmu_error):
                sync += Display("completing MMU ld with error")

            # Slow ops (i.e. load miss)
            with m.If(r1.slow_valid):
                sync += Display("completing MMU load miss, adr=%x data=%x",
                                r1.req.real_addr, m_out.data)

    def rams(self, m, r1, early_req_row, cache_out_row, replace_way):
        """rams
        Generate a cache RAM for each way. This handles the normal
        reads, writes from reloads and the special store-hit update
        path as well.

        Note: the BRAMs have an extra read buffer, meaning the output
        is pipelined an extra cycle. This differs from the
        icache. The writeback logic needs to take that into
        account by using 1-cycle delayed signals for load hits.
        """
        comb = m.d.comb
        bus = self.bus

        # XXX TODO: use a Binary-to-Unary Encoder here
        hit_way_onehot = Signal(NUM_WAYS)
        replace_way_onehot = Signal(NUM_WAYS)
        hit_req_way_onehot = Signal(NUM_WAYS)
        comb += hit_way_onehot.eq(1<<r1.hit_way)
        comb += hit_req_way_onehot.eq(1<<r1.req.hit_way)
        comb += replace_way_onehot.eq(1<<replace_way)

        for i in range(NUM_WAYS):
            do_read = Signal(name="do_rd%d" % i)
            rd_addr = Signal(ROW_BITS, name="rd_addr_%d" % i)
            do_write = Signal(name="do_wr%d" % i)
            wr_addr = Signal(ROW_BITS, name="wr_addr_%d" % i)
            wr_data = Signal(WB_DATA_BITS, name="din_%d" % i)
            wr_sel = Signal(ROW_SIZE)
            wr_sel_m = Signal(ROW_SIZE)
            _d_out = Signal(WB_DATA_BITS, name="dout_%d" % i) # cache_row_t

            way = CacheRam(ROW_BITS, WB_DATA_BITS, ADD_BUF=True, ram_num=i)
            setattr(m.submodules, "cacheram_%d" % i, way)

            comb += way.rd_en.eq(do_read)
            comb += way.rd_addr.eq(rd_addr)
            comb += _d_out.eq(way.rd_data_o)
            comb += way.wr_sel.eq(wr_sel_m)
            comb += way.wr_addr.eq(wr_addr)
            comb += way.wr_data.eq(wr_data)

            # Cache hit reads
            comb += do_read.eq(1)
            comb += rd_addr.eq(early_req_row)
            with m.If(hit_way_onehot[i]):
                comb += cache_out_row.eq(_d_out)

            # Write mux:
            #
            # Defaults to wishbone read responses (cache refill)
            #
            # For timing, the mux on wr_data/sel/addr is not
            # dependent on anything other than the current state.

            with m.If(r1.write_bram):
                # Write store data to BRAM. This happens one
                # cycle after the store is in r0.
                comb += wr_data.eq(r1.req.data)
                comb += wr_sel.eq(r1.req.byte_sel)
                comb += wr_addr.eq(get_row(r1.req.real_addr))

                comb += do_write.eq(hit_req_way_onehot[i])
            with m.Else():
                # Otherwise, we might be doing a reload or a DCBZ
                with m.If(r1.dcbz):
                    comb += wr_data.eq(0)
                with m.Else():
                    comb += wr_data.eq(bus.dat_r)
                comb += wr_addr.eq(r1.store_row)
                comb += wr_sel.eq(~0) # all 1s

                with m.If((r1.state == State.RELOAD_WAIT_ACK) &
                          bus.ack & replace_way_onehot[i]):
                    comb += do_write.eq(1)

            # Mask write selects with do_write since BRAM
            # doesn't have a global write-enable
            with m.If(do_write):
                comb += wr_sel_m.eq(wr_sel)

    # Cache hit synchronous machine for the easy case.
    # This handles load hits.
    # It also handles error cases (TLB miss, cache paradox)
    def dcache_fast_hit(self, m, req_op, r0_valid, r0, r1,
                        req_hit_way, req_index, req_tag, access_ok,
                        tlb_hit, tlb_req_index):
        comb = m.d.comb
        sync = m.d.sync

        with m.If(req_op != Op.OP_NONE):
            sync += Display("op:%d addr:%x nc: %d idx: %x tag: %x way: %x",
                            req_op, r0.req.addr, r0.req.nc,
                            req_index, req_tag, req_hit_way)

        with m.If(r0_valid):
            sync += r1.mmu_req.eq(r0.mmu_req)

        # Fast path for load/store hits.
        # Set signals for the writeback controls.
        sync += r1.hit_way.eq(req_hit_way)
        sync += r1.hit_index.eq(req_index)

        sync += r1.hit_load_valid.eq(req_op == Op.OP_LOAD_HIT)
        sync += r1.cache_hit.eq((req_op == Op.OP_LOAD_HIT) |
                                (req_op == Op.OP_STORE_HIT))

        with m.If(req_op == Op.OP_BAD):
            sync += Display("Signalling ld/st error "
                            "ls_error=%i mmu_error=%i cache_paradox=%i",
                            ~r0.mmu_req, r0.mmu_req, access_ok)
            sync += r1.ls_error.eq(~r0.mmu_req)
            sync += r1.mmu_error.eq(r0.mmu_req)
            sync += r1.cache_paradox.eq(access_ok)
        with m.Else():
            sync += r1.ls_error.eq(0)
            sync += r1.mmu_error.eq(0)
            sync += r1.cache_paradox.eq(0)

        sync += r1.stcx_fail.eq(req_op == Op.OP_STCX_FAIL)

        # Record TLB hit information for updating TLB PLRU
        sync += r1.tlb_hit.eq(tlb_hit)
        sync += r1.tlb_hit_index.eq(tlb_req_index)

    # Memory accesses are handled by this state machine:
    #
    #   * Cache load miss/reload (in conjunction with "rams")
    #   * Load hits for non-cachable forms
    #   * Stores (the collision case is handled in "rams")
    #
    # All wishbone requests generation is done here.
    # This machine operates at stage 1.
    def dcache_slow(self, m, r1, use_forward1_next, use_forward2_next,
                    r0, replace_way,
                    req_hit_way, req_same_tag,
                    r0_valid, req_op, cache_tags, req_go, ra):

        comb = m.d.comb
        sync = m.d.sync
        bus = self.bus
        d_in = self.d_in

        req = MemAccessRequest("mreq_ds")

        req_row = Signal(ROW_BITS)
        req_idx = Signal(INDEX_BITS)
        req_tag = Signal(TAG_BITS)
        comb += req_idx.eq(get_index(req.real_addr))
        comb += req_row.eq(get_row(req.real_addr))
        comb += req_tag.eq(get_tag(req.real_addr))

        sync += r1.use_forward1.eq(use_forward1_next)
        sync += r1.forward_sel.eq(0)

        with m.If(use_forward1_next):
            sync += r1.forward_sel.eq(r1.req.byte_sel)
        with m.Elif(use_forward2_next):
            sync += r1.forward_sel.eq(r1.forward_sel1)

        sync += r1.forward_data2.eq(r1.forward_data1)
        with m.If(r1.write_bram):
            sync += r1.forward_data1.eq(r1.req.data)
            sync += r1.forward_sel1.eq(r1.req.byte_sel)
            sync += r1.forward_way1.eq(r1.req.hit_way)
            sync += r1.forward_row1.eq(get_row(r1.req.real_addr))
            sync += r1.forward_valid1.eq(1)
        with m.Else():
            with m.If(r1.dcbz):
                sync += r1.forward_data1.eq(0)
            with m.Else():
                sync += r1.forward_data1.eq(bus.dat_r)
            sync += r1.forward_sel1.eq(~0) # all 1s
            sync += r1.forward_way1.eq(replace_way)
            sync += r1.forward_row1.eq(r1.store_row)
            sync += r1.forward_valid1.eq(0)

        # One cycle pulses reset
        sync += r1.slow_valid.eq(0)
        sync += r1.write_bram.eq(0)
        sync += r1.inc_acks.eq(0)
        sync += r1.dec_acks.eq(0)

        sync += r1.ls_valid.eq(0)
        # complete tlbies and TLB loads in the third cycle
        sync += r1.mmu_done.eq(r0_valid & (r0.tlbie | r0.tlbld))

        with m.If((req_op == Op.OP_LOAD_HIT) | (req_op == Op.OP_STCX_FAIL)):
            with m.If(~r0.mmu_req):
                sync += r1.ls_valid.eq(1)
            with m.Else():
                sync += r1.mmu_done.eq(1)

        with m.If(r1.write_tag):
            # Store new tag in selected way
            replace_way_onehot = Signal(NUM_WAYS)
            comb += replace_way_onehot.eq(1<<replace_way)
            for i in range(NUM_WAYS):
                with m.If(replace_way_onehot[i]):
                    ct = Signal(TAG_RAM_WIDTH)
                    comb += ct.eq(cache_tags[r1.store_index].tag)
                    comb += ct.word_select(i, TAG_WIDTH).eq(r1.reload_tag)
                    sync += cache_tags[r1.store_index].tag.eq(ct)
            sync += r1.store_way.eq(replace_way)
            sync += r1.write_tag.eq(0)

        # Take request from r1.req if there is one there,
        # else from req_op, ra, etc.
        with m.If(r1.full):
            comb += req.eq(r1.req)
        with m.Else():
            comb += req.op.eq(req_op)
            comb += req.valid.eq(req_go)
            comb += req.mmu_req.eq(r0.mmu_req)
            comb += req.dcbz.eq(r0.req.dcbz)
            comb += req.real_addr.eq(ra)

            with m.If(r0.req.dcbz):
                # force data to 0 for dcbz
                comb += req.data.eq(0)
            with m.Elif(r0.d_valid):
                comb += req.data.eq(r0.req.data)
            with m.Else():
                comb += req.data.eq(d_in.data)

            # Select all bytes for dcbz
            # and for cacheable loads
            with m.If(r0.req.dcbz | (r0.req.load & ~r0.req.nc)):
                comb += req.byte_sel.eq(~0) # all 1s
            with m.Else():
                comb += req.byte_sel.eq(r0.req.byte_sel)
            comb += req.hit_way.eq(req_hit_way)
            comb += req.same_tag.eq(req_same_tag)

        # Store the incoming request from r0,
        # if it is a slow request
        # Note that r1.full = 1 implies req_op = OP_NONE
        with m.If((req_op == Op.OP_LOAD_MISS)
                  | (req_op == Op.OP_LOAD_NC)
                  | (req_op == Op.OP_STORE_MISS)
                  | (req_op == Op.OP_STORE_HIT)):
            sync += r1.req.eq(req)
            sync += r1.full.eq(1)

        # Main state machine
        with m.Switch(r1.state):

            with m.Case(State.IDLE):
                sync += r1.wb.adr.eq(req.real_addr[ROW_LINE_BITS:])
                sync += r1.wb.sel.eq(req.byte_sel)
                sync += r1.wb.dat.eq(req.data)
                sync += r1.dcbz.eq(req.dcbz)

                # Keep track of our index and way
                # for subsequent stores.
                sync += r1.store_index.eq(req_idx)
                sync += r1.store_row.eq(req_row)
                sync += r1.end_row_ix.eq(get_row_of_line(req_row)-1)
                sync += r1.reload_tag.eq(req_tag)
                sync += r1.req.same_tag.eq(1)

                with m.If(req.op == Op.OP_STORE_HIT):
                    sync += r1.store_way.eq(req.hit_way)

                # Reset per-row valid bits,
                # ready for handling OP_LOAD_MISS
                for i in range(ROW_PER_LINE):
                    sync += r1.rows_valid[i].eq(0)

                with m.If(req_op != Op.OP_NONE):
                    sync += Display("cache op %d", req.op)

                with m.Switch(req.op):
                    with m.Case(Op.OP_LOAD_HIT):
                        # stay in IDLE state
                        pass

                    with m.Case(Op.OP_LOAD_MISS):
                        sync += Display("cache miss real addr: %x " \
                                        "idx: %x tag: %x",
                                        req.real_addr, req_row, req_tag)

                        # Start the wishbone cycle
                        sync += r1.wb.we.eq(0)
                        sync += r1.wb.cyc.eq(1)
                        sync += r1.wb.stb.eq(1)

                        # Track that we had one request sent
                        sync += r1.state.eq(State.RELOAD_WAIT_ACK)
                        sync += r1.write_tag.eq(1)

                    with m.Case(Op.OP_LOAD_NC):
                        sync += r1.wb.cyc.eq(1)
                        sync += r1.wb.stb.eq(1)
                        sync += r1.wb.we.eq(0)
                        sync += r1.state.eq(State.NC_LOAD_WAIT_ACK)

                    with m.Case(Op.OP_STORE_HIT, Op.OP_STORE_MISS):
                        with m.If(~req.dcbz):
                            sync += r1.state.eq(State.STORE_WAIT_ACK)
                            sync += r1.acks_pending.eq(1)
                            sync += r1.full.eq(0)
                            sync += r1.slow_valid.eq(1)

                            with m.If(~req.mmu_req):
                                sync += r1.ls_valid.eq(1)
                            with m.Else():
                                sync += r1.mmu_done.eq(1)

                            with m.If(req.op == Op.OP_STORE_HIT):
                                sync += r1.write_bram.eq(1)
                        with m.Else():
                            # dcbz is handled much like a load miss except
                            # that we are writing to memory instead of reading
                            sync += r1.state.eq(State.RELOAD_WAIT_ACK)

                            with m.If(req.op == Op.OP_STORE_MISS):
                                sync += r1.write_tag.eq(1)

                        sync += r1.wb.we.eq(1)
                        sync += r1.wb.cyc.eq(1)
                        sync += r1.wb.stb.eq(1)

                    # OP_NONE and OP_BAD do nothing
                    # OP_BAD & OP_STCX_FAIL were
                    # handled above already
                    with m.Case(Op.OP_NONE):
                        pass
                    with m.Case(Op.OP_BAD):
                        pass
                    with m.Case(Op.OP_STCX_FAIL):
                        pass

            with m.Case(State.RELOAD_WAIT_ACK):
                ld_stbs_done = Signal()
                # Requests are all sent if stb is 0
                comb += ld_stbs_done.eq(~r1.wb.stb)

                # If we are still sending requests, was one accepted?
                with m.If((~bus.stall) & r1.wb.stb):
                    # That was the last word? We are done sending.
                    # Clear stb and set ld_stbs_done so we can handle an
                    # eventual last ack on the same cycle.
                    # sigh - reconstruct wb adr with 3 extra 0s at front
                    wb_adr = Cat(Const(0, ROW_OFF_BITS), r1.wb.adr)
                    with m.If(is_last_row_addr(wb_adr, r1.end_row_ix)):
                        sync += r1.wb.stb.eq(0)
                        comb += ld_stbs_done.eq(1)

                    # Calculate the next row address in the current cache line
                    row = Signal(LINE_OFF_BITS-ROW_OFF_BITS)
                    comb += row.eq(r1.wb.adr)
                    sync += r1.wb.adr[:LINE_OFF_BITS-ROW_OFF_BITS].eq(row+1)

                # Incoming acks processing
                sync += r1.forward_valid1.eq(bus.ack)
                with m.If(bus.ack):
                    srow = Signal(ROW_LINE_BITS)
                    comb += srow.eq(r1.store_row)
                    sync += r1.rows_valid[srow].eq(1)

                    # If this is the data we were looking for,
                    # we can complete the request next cycle.
                    # Compare the whole address in case the
                    # request in r1.req is not the one that
                    # started this refill.
                    with m.If(req.valid & r1.req.same_tag &
                              ((r1.dcbz & r1.req.dcbz) |
                               (~r1.dcbz & (r1.req.op == Op.OP_LOAD_MISS))) &
                              (r1.store_row == get_row(req.real_addr))):
                        sync += r1.full.eq(0)
                        sync += r1.slow_valid.eq(1)
                        with m.If(~r1.mmu_req):
                            sync += r1.ls_valid.eq(1)
                        with m.Else():
                            sync += r1.mmu_done.eq(1)
                        sync += r1.forward_sel.eq(~0) # all 1s
                        sync += r1.use_forward1.eq(1)

                    # Check for completion
                    with m.If(ld_stbs_done & is_last_row(r1.store_row,
                                                         r1.end_row_ix)):
                        # Complete wishbone cycle
                        sync += r1.wb.cyc.eq(0)

                        # Cache line is now valid
                        cv = Signal(NUM_WAYS) # per-way valid bits
                        comb += cv.eq(cache_tags[r1.store_index].valid)
                        comb += cv.bit_select(r1.store_way, 1).eq(1)
                        sync += cache_tags[r1.store_index].valid.eq(cv)

                        sync += r1.state.eq(State.IDLE)
                        sync += Display("cache valid set %x "
                                        "idx %d way %d",
                                        cv, r1.store_index, r1.store_way)

                    # Increment store row counter
                    sync += r1.store_row.eq(next_row(r1.store_row))

            with m.Case(State.STORE_WAIT_ACK):
                st_stbs_done = Signal()
                acks = Signal(3)
                adjust_acks = Signal(3)

                comb += st_stbs_done.eq(~r1.wb.stb)
                comb += acks.eq(r1.acks_pending)

                with m.If(r1.inc_acks != r1.dec_acks):
                    with m.If(r1.inc_acks):
                        comb += adjust_acks.eq(acks + 1)
                    with m.Else():
                        comb += adjust_acks.eq(acks - 1)
                with m.Else():
                    comb += adjust_acks.eq(acks)

                sync += r1.acks_pending.eq(adjust_acks)

                # Clear stb when slave accepted request
                with m.If(~bus.stall):
                    # See if there is another store waiting
                    # to be done which is in the same real page.
                    with m.If(req.valid):
                        _ra = req.real_addr[ROW_LINE_BITS:SET_SIZE_BITS]
                        sync += r1.wb.adr[0:SET_SIZE_BITS].eq(_ra)
                        sync += r1.wb.dat.eq(req.data)
                        sync += r1.wb.sel.eq(req.byte_sel)

                    with m.If((adjust_acks < 7) & req.same_tag &
                              ((req.op == Op.OP_STORE_MISS)
                               | (req.op == Op.OP_STORE_HIT))):
                        sync += r1.wb.stb.eq(1)
                        comb += st_stbs_done.eq(0)

                        with m.If(req.op == Op.OP_STORE_HIT):
                            sync += r1.write_bram.eq(1)
                        sync += r1.full.eq(0)
                        sync += r1.slow_valid.eq(1)

                        # Store requests never come from the MMU
                        sync += r1.ls_valid.eq(1)
                        comb += st_stbs_done.eq(0)
                        sync += r1.inc_acks.eq(1)
                    with m.Else():
                        sync += r1.wb.stb.eq(0)
                        comb += st_stbs_done.eq(1)

                # Got ack ? See if complete.
                with m.If(bus.ack):
                    with m.If(st_stbs_done & (adjust_acks == 1)):
                        sync += r1.state.eq(State.IDLE)
                        sync += r1.wb.cyc.eq(0)
                        sync += r1.wb.stb.eq(0)
                    sync += r1.dec_acks.eq(1)

            with m.Case(State.NC_LOAD_WAIT_ACK):
                # Clear stb when slave accepted request
                with m.If(~bus.stall):
                    sync += r1.wb.stb.eq(0)

                # Got ack ? complete.
                with m.If(bus.ack):
                    sync += r1.state.eq(State.IDLE)
                    sync += r1.full.eq(0)
                    sync += r1.slow_valid.eq(1)

                    with m.If(~r1.mmu_req):
                        sync += r1.ls_valid.eq(1)
                    with m.Else():
                        sync += r1.mmu_done.eq(1)

                    sync += r1.forward_sel.eq(~0) # all 1s
                    sync += r1.use_forward1.eq(1)
                    sync += r1.wb.cyc.eq(0)
                    sync += r1.wb.stb.eq(0)

    def dcache_log(self, m, r1, valid_ra, tlb_hit, stall_out, req_op):
        # currently unused: the call in elaborate is commented out.
        # (the req_op parameter and the r1.wb.adr address tap are
        # assumptions, chosen so the function references only signals
        # that exist in this module)

        sync = m.d.sync
        d_out, bus, log_out = self.d_out, self.bus, self.log_out

        sync += log_out.eq(Cat(r1.state[:3], valid_ra, tlb_hit.way[:3],
                               stall_out, req_op[:3], d_out.valid, d_out.error,
                               r1.wb.cyc, r1.wb.stb, bus.ack, bus.stall,
                               r1.wb.adr[3:6]))

    def elaborate(self, platform):

        m = Module()
        comb = m.d.comb
        d_in = self.d_in

        # Storage. Hopefully "cache_rows" is a BRAM, the rest is LUTs
        cache_tags = CacheTagArray()
        cache_tag_set = Signal(TAG_RAM_WIDTH)

        # TODO attribute ram_style : string;
        # TODO attribute ram_style of cache_tags : signal is "distributed";

        """note: these are passed to nmigen.hdl.Memory as "attributes".
        don't know how, just that they are.
        """
        dtlb = TLBArray()
        # TODO attribute ram_style of
        #  dtlb_tags : signal is "distributed";
        # TODO attribute ram_style of
        #  dtlb_ptes : signal is "distributed";

        r0 = RegStage0("r0")
        r0_full = Signal()

        r1 = RegStage1("r1")

        reservation = Reservation()

        # Async signals on incoming request
        req_index = Signal(INDEX_BITS)
        req_row = Signal(ROW_BITS)
        req_hit_way = Signal(WAY_BITS)
        req_tag = Signal(TAG_BITS)
        req_op = Signal(Op)
        req_data = Signal(64)
        req_same_tag = Signal()
        req_go = Signal()

        early_req_row = Signal(ROW_BITS)

        cancel_store = Signal()
        set_rsrv = Signal()
        clear_rsrv = Signal()

        r0_valid = Signal()
        r0_stall = Signal()

        use_forward1_next = Signal()
        use_forward2_next = Signal()

        cache_out_row = Signal(WB_DATA_BITS)

        plru_victim = PLRUOut()
        replace_way = Signal(WAY_BITS)

        # Wishbone read/write/cache write formatting signals
        bus_sel = Signal(8)

        # TLB signals
        tlb_way = TLBRecord("tlb_way")
        tlb_req_index = Signal(TLB_SET_BITS)
        tlb_hit = TLBHit("tlb_hit")
        pte = Signal(TLB_PTE_BITS)
        ra = Signal(REAL_ADDR_BITS)
        valid_ra = Signal()
        perm_attr = PermAttr("dc_perms")
        rc_ok = Signal()
        perm_ok = Signal()
        access_ok = Signal()

        tlb_plru_victim = TLBPLRUOut()

        # we don't yet handle collisions between loadstore1 requests
        # and MMU requests
        comb += self.m_out.stall.eq(0)

        # Hold off the request in r0 when r1 has an uncompleted request
        comb += r0_stall.eq(r0_full & (r1.full | d_in.hold))
        comb += r0_valid.eq(r0_full & ~r1.full & ~d_in.hold)
        comb += self.stall_out.eq(r0_stall)

        # deal with litex not doing wishbone pipeline mode
        # XXX in wrong way. FIFOs are needed in the SRAM test
        # so that stb/ack match up. same thing done in icache.py
        comb += self.bus.stall.eq(self.bus.cyc & ~self.bus.ack)

        # Wire up wishbone request latch out of stage 1
        comb += self.bus.we.eq(r1.wb.we)
        comb += self.bus.adr.eq(r1.wb.adr)
        comb += self.bus.sel.eq(r1.wb.sel)
        comb += self.bus.stb.eq(r1.wb.stb)
        comb += self.bus.dat_w.eq(r1.wb.dat)
        comb += self.bus.cyc.eq(r1.wb.cyc)

        # call sub-functions putting everything together, using shared
        # signals established above
        self.stage_0(m, r0, r1, r0_full)
        self.tlb_read(m, r0_stall, tlb_way, dtlb)
        self.tlb_search(m, tlb_req_index, r0, r0_valid,
                        tlb_way,
                        pte, tlb_hit, valid_ra, perm_attr, ra)
        self.tlb_update(m, r0_valid, r0, dtlb, tlb_req_index,
                        tlb_hit, tlb_plru_victim,
                        tlb_way)
        self.maybe_plrus(m, r1, plru_victim)
        self.maybe_tlb_plrus(m, r1, tlb_plru_victim)
        self.cache_tag_read(m, r0_stall, req_index, cache_tag_set, cache_tags)
        self.dcache_request(m, r0, ra, req_index, req_row, req_tag,
                            r0_valid, r1, cache_tags, replace_way,
                            use_forward1_next, use_forward2_next,
                            req_hit_way, plru_victim, rc_ok, perm_attr,
                            valid_ra, perm_ok, access_ok, req_op, req_go,
                            tlb_hit, tlb_way, cache_tag_set,
                            cancel_store, req_same_tag, r0_stall,
                            early_req_row)
        self.reservation_comb(m, cancel_store, set_rsrv, clear_rsrv,
                              r0_valid, r0, reservation)
        self.reservation_reg(m, r0_valid, access_ok, set_rsrv, clear_rsrv,
                             reservation, r0)
        self.writeback_control(m, r1, cache_out_row)
        self.rams(m, r1, early_req_row, cache_out_row, replace_way)
        self.dcache_fast_hit(m, req_op, r0_valid, r0, r1,
                             req_hit_way, req_index, req_tag, access_ok,
                             tlb_hit, tlb_req_index)
        self.dcache_slow(m, r1, use_forward1_next, use_forward2_next,
                         r0, replace_way,
                         req_hit_way, req_same_tag,
                         r0_valid, req_op, cache_tags, req_go, ra)
        #self.dcache_log(m, r1, valid_ra, tlb_hit, self.stall_out, req_op)

        return m


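# A minimal simulation sketch, added for illustration (an assumption:
# the function name, the stimulus and the vcd filename are not part of
# the original file, which keeps its tests elsewhere).  It instantiates
# the DCache and clocks it for a few idle cycles; a real test would also
# attach an SRAM wishbone model (see the "for test" imports above) and
# drive d_in requests.
def demo_sim_sketch():
    m = Module()
    m.submodules.dut = DCache()
    sim = Simulator(m)
    sim.add_clock(1e-6)

    def process():
        # no d_in/m_in requests: just run ten idle cycles
        for _ in range(10):
            yield

    sim.add_sync_process(wrap(process()))
    with sim.write_vcd('test_dcache_idle.vcd'):
        sim.run()

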
if __name__ == '__main__':
    dut = DCache()
    vl = rtlil.convert(dut, ports=[])
    with open("test_dcache.il", "w") as f:
        f.write(vl)