src/soc/experiment/dcache.py

   1 """DCache
   2
   3 based on Anton Blanchard microwatt dcache.vhdl
   4
   5 note that the microwatt dcache wishbone interface expects "stall".
   6 for simplicity at the moment this is hard-coded to cyc & ~ack.
   7 see WB4 spec, p84, section 5.2.1
   8
   9 IMPORTANT: for store, the data is sampled the cycle AFTER the "valid"
  10 is raised.  sigh
  11
  12 Links:
  13
  14 * https://libre-soc.org/3d_gpu/architecture/set_associative_cache.jpg
  15 * https://bugs.libre-soc.org/show_bug.cgi?id=469
  16
  17 """
  18
  19 import sys
  20
  21 from nmutil.gtkw import write_gtkw
  22
  23 sys.setrecursionlimit(1000000)
  24
  25 from enum import Enum, unique
  26
  27 from nmigen import (Module, Signal, Elaboratable, Cat, Repl, Array, Const,
  28                     Record)
  29 from nmutil.util import Display
  30 from nmigen.lib.coding import Decoder
  31
  32 from copy import deepcopy
  33 from random import randint, seed
  34
  35 from nmigen_soc.wishbone.bus import Interface
  36
  37 from nmigen.cli import main
  38 from nmutil.iocontrol import RecordObject
  39 from nmigen.utils import log2_int
  40 from soc.experiment.mem_types import (LoadStore1ToDCacheType,
  41                                      DCacheToLoadStore1Type,
  42                                      MMUToDCacheType,
  43                                      DCacheToMMUType)
  44
  45 from soc.experiment.wb_types import (WB_ADDR_BITS, WB_DATA_BITS, WB_SEL_BITS,
  46                                 WBAddrType, WBDataType, WBSelType,
  47                                 WBMasterOut, WBSlaveOut,
  48                                 WBMasterOutVector, WBSlaveOutVector,
  49                                 WBIOMasterOut, WBIOSlaveOut)
  50
  51 from soc.experiment.cache_ram import CacheRam
  52 #from soc.experiment.plru import PLRU
  53 from nmutil.plru import PLRU
  54
  55 # for test
  56 from soc.bus.sram import SRAM
  57 from nmigen import Memory
  58 from nmigen.cli import rtlil
  59
  60 # NOTE: to use cxxsim, export NMIGEN_SIM_MODE=cxxsim from the shell
  61 # Also, check out the cxxsim nmigen branch, and latest yosys from git
  62 from nmutil.sim_tmp_alternative import Simulator
  63
  64 from nmutil.util import wrap
  65
  66
  67 # TODO: make these parameters of DCache at some point
  68 LINE_SIZE = 64    # Line size in bytes
  69 NUM_LINES = 16    # Number of lines in a set
  70 NUM_WAYS = 4      # Number of ways
  71 TLB_SET_SIZE = 64 # L1 DTLB entries per set
  72 TLB_NUM_WAYS = 2  # L1 DTLB number of sets
  73 TLB_LG_PGSZ = 12  # L1 DTLB log_2(page_size)
  74 LOG_LENGTH = 0    # Non-zero to enable log data collection
  75
  76 # BRAM organisation: We never access more than
  77 #     -- WB_DATA_BITS at a time so to save
  78 #     -- resources we make the array only that wide, and
  79 #     -- use consecutive indices to make a cache "line"
  80 #     --
  81 #     -- ROW_SIZE is the width in bytes of the BRAM
  82 #     -- (based on WB, so 64-bits)
  83 ROW_SIZE = WB_DATA_BITS // 8;
  84
  85 # ROW_PER_LINE is the number of row (wishbone
  86 # transactions) in a line
  87 ROW_PER_LINE = LINE_SIZE // ROW_SIZE
  88
  89 # BRAM_ROWS is the number of rows in BRAM needed
  90 # to represent the full dcache
  91 BRAM_ROWS = NUM_LINES * ROW_PER_LINE
  92
  93 print ("ROW_SIZE", ROW_SIZE)
  94 print ("ROW_PER_LINE", ROW_PER_LINE)
  95 print ("BRAM_ROWS", BRAM_ROWS)
  96 print ("NUM_WAYS", NUM_WAYS)
  97
  98 # Bit fields counts in the address
  99
 100 # REAL_ADDR_BITS is the number of real address
 101 # bits that we store
 102 REAL_ADDR_BITS = 56
 103
 104 # ROW_BITS is the number of bits to select a row
 105 ROW_BITS = log2_int(BRAM_ROWS)
 106
 107 # ROW_LINE_BITS is the number of bits to select
 108 # a row within a line
 109 ROW_LINE_BITS = log2_int(ROW_PER_LINE)
 110
 111 # LINE_OFF_BITS is the number of bits for
 112 # the offset in a cache line
 113 LINE_OFF_BITS = log2_int(LINE_SIZE)
 114
 115 # ROW_OFF_BITS is the number of bits for
 116 # the offset in a row
 117 ROW_OFF_BITS = log2_int(ROW_SIZE)
 118
 119 # INDEX_BITS is the number if bits to
 120 # select a cache line
 121 INDEX_BITS = log2_int(NUM_LINES)
 122
 123 # SET_SIZE_BITS is the log base 2 of the set size
 124 SET_SIZE_BITS = LINE_OFF_BITS + INDEX_BITS
 125
 126 # TAG_BITS is the number of bits of
 127 # the tag part of the address
 128 TAG_BITS = REAL_ADDR_BITS - SET_SIZE_BITS
 129
 130 # TAG_WIDTH is the width in bits of each way of the tag RAM
 131 TAG_WIDTH = TAG_BITS + 7 - ((TAG_BITS + 7) % 8)
 132
 133 # WAY_BITS is the number of bits to select a way
 134 WAY_BITS = log2_int(NUM_WAYS)
 135
 136 # Example of layout for 32 lines of 64 bytes:
 137 layout = """\
 138   ..  tag    |index|  line  |
 139   ..         |   row   |    |
 140   ..         |     |---|    | ROW_LINE_BITS  (3)
 141   ..         |     |--- - --| LINE_OFF_BITS (6)
 142   ..         |         |- --| ROW_OFF_BITS  (3)
 143   ..         |----- ---|    | ROW_BITS      (8)
 144   ..         |-----|        | INDEX_BITS    (5)
 145   .. --------|              | TAG_BITS      (45)
 146 """
 147 print (layout)
 148 print ("Dcache TAG %d IDX %d ROW_BITS %d ROFF %d LOFF %d RLB %d" % \
 149             (TAG_BITS, INDEX_BITS, ROW_BITS,
 150              ROW_OFF_BITS, LINE_OFF_BITS, ROW_LINE_BITS))
 151 print ("index @: %d-%d" % (LINE_OFF_BITS, SET_SIZE_BITS))
 152 print ("row @: %d-%d" % (LINE_OFF_BITS, ROW_OFF_BITS))
 153 print ("tag @: %d-%d width %d" % (SET_SIZE_BITS, REAL_ADDR_BITS, TAG_WIDTH))
 154
 155 TAG_RAM_WIDTH = TAG_WIDTH * NUM_WAYS
 156
 157 print ("TAG_RAM_WIDTH", TAG_RAM_WIDTH)
 158
 159 def CacheTagArray():
 160     tag_layout = [('valid', 1),
 161                   ('tag', TAG_RAM_WIDTH),
 162                  ]
 163     return Array(Record(tag_layout, name="tag%d" % x) for x in range(NUM_LINES))
 164
 165 def RowPerLineValidArray():
 166     return Array(Signal(name="rows_valid%d" % x) \
 167                         for x in range(ROW_PER_LINE))
 168
 169 # L1 TLB
 170 TLB_SET_BITS     = log2_int(TLB_SET_SIZE)
 171 TLB_WAY_BITS     = log2_int(TLB_NUM_WAYS)
 172 TLB_EA_TAG_BITS  = 64 - (TLB_LG_PGSZ + TLB_SET_BITS)
 173 TLB_TAG_WAY_BITS = TLB_NUM_WAYS * TLB_EA_TAG_BITS
 174 TLB_PTE_BITS     = 64
 175 TLB_PTE_WAY_BITS = TLB_NUM_WAYS * TLB_PTE_BITS;
 176
 177 def ispow2(x):
 178     return (1<<log2_int(x, False)) == x
 179
 180 assert (LINE_SIZE % ROW_SIZE) == 0, "LINE_SIZE not multiple of ROW_SIZE"
 181 assert ispow2(LINE_SIZE), "LINE_SIZE not power of 2"
 182 assert ispow2(NUM_LINES), "NUM_LINES not power of 2"
 183 assert ispow2(ROW_PER_LINE), "ROW_PER_LINE not power of 2"
 184 assert ROW_BITS == (INDEX_BITS + ROW_LINE_BITS), "geometry bits don't add up"
 185 assert (LINE_OFF_BITS == ROW_OFF_BITS + ROW_LINE_BITS), \
 186         "geometry bits don't add up"
 187 assert REAL_ADDR_BITS == (TAG_BITS + INDEX_BITS + LINE_OFF_BITS), \
 188         "geometry bits don't add up"
 189 assert REAL_ADDR_BITS == (TAG_BITS + ROW_BITS + ROW_OFF_BITS), \
 190          "geometry bits don't add up"
 191 assert 64 == WB_DATA_BITS, "Can't yet handle wb width that isn't 64-bits"
 192 assert SET_SIZE_BITS <= TLB_LG_PGSZ, "Set indexed by virtual address"
 193
 194 def TLBHit(name):
 195     return Record([('valid', 1),
 196                    ('way', TLB_WAY_BITS)], name=name)
 197
 198 def TLBTagEAArray():
 199     return Array(Signal(TLB_EA_TAG_BITS, name="tlbtagea%d" % x) \
 200                 for x in range (TLB_NUM_WAYS))
 201
 202 def TLBRecord(name):
 203     tlb_layout = [('valid', TLB_NUM_WAYS),
 204                   ('tag', TLB_TAG_WAY_BITS),
 205                   ('pte', TLB_PTE_WAY_BITS)
 206                  ]
 207     return Record(tlb_layout, name=name)
 208
 209 def TLBArray():
 210     return Array(TLBRecord(name="tlb%d" % x) for x in range(TLB_SET_SIZE))
 211
 212 def HitWaySet():
 213     return Array(Signal(WAY_BITS, name="hitway_%d" % x) \
 214                         for x in range(TLB_NUM_WAYS))
 215
 216 # Cache RAM interface
 217 def CacheRamOut():
 218     return Array(Signal(WB_DATA_BITS, name="cache_out%d" % x) \
 219                  for x in range(NUM_WAYS))
 220
 221 # PLRU output interface
 222 def PLRUOut():
 223     return Array(Signal(WAY_BITS, name="plru_out%d" % x) \
 224                 for x in range(NUM_LINES))
 225
 226 # TLB PLRU output interface
 227 def TLBPLRUOut():
 228     return Array(Signal(TLB_WAY_BITS, name="tlbplru_out%d" % x) \
 229                 for x in range(TLB_SET_SIZE))
 230
 231 # Helper functions to decode incoming requests
 232 #
 233 # Return the cache line index (tag index) for an address
 234 def get_index(addr):
 235     return addr[LINE_OFF_BITS:SET_SIZE_BITS]
 236
 237 # Return the cache row index (data memory) for an address
 238 def get_row(addr):
 239     return addr[ROW_OFF_BITS:SET_SIZE_BITS]
 240
 241 # Return the index of a row within a line
 242 def get_row_of_line(row):
 243     return row[:ROW_BITS][:ROW_LINE_BITS]
 244
 245 # Returns whether this is the last row of a line
 246 def is_last_row_addr(addr, last):
 247     return addr[ROW_OFF_BITS:LINE_OFF_BITS] == last
 248
 249 # Returns whether this is the last row of a line
 250 def is_last_row(row, last):
 251     return get_row_of_line(row) == last
 252
 253 # Return the next row in the current cache line. We use a
 254 # dedicated function in order to limit the size of the
 255 # generated adder to be only the bits within a cache line
 256 # (3 bits with default settings)
 257 def next_row(row):
 258     row_v = row[0:ROW_LINE_BITS] + 1
 259     return Cat(row_v[:ROW_LINE_BITS], row[ROW_LINE_BITS:])
 260
 261 # Get the tag value from the address
 262 def get_tag(addr):
 263     return addr[SET_SIZE_BITS:REAL_ADDR_BITS]
 264
 265 # Read a tag from a tag memory row
 266 def read_tag(way, tagset):
 267     return tagset.word_select(way, TAG_WIDTH)[:TAG_BITS]
 268
 269 # Read a TLB tag from a TLB tag memory row
 270 def read_tlb_tag(way, tags):
 271     return tags.word_select(way, TLB_EA_TAG_BITS)
 272
 273 # Write a TLB tag to a TLB tag memory row
 274 def write_tlb_tag(way, tags, tag):
 275     return read_tlb_tag(way, tags).eq(tag)
 276
 277 # Read a PTE from a TLB PTE memory row
 278 def read_tlb_pte(way, ptes):
 279     return ptes.word_select(way, TLB_PTE_BITS)
 280
 281 def write_tlb_pte(way, ptes, newpte):
 282     return read_tlb_pte(way, ptes).eq(newpte)
 283
 284
 285 # Record for storing permission, attribute, etc. bits from a PTE
 286 class PermAttr(RecordObject):
 287     def __init__(self, name=None):
 288         super().__init__(name=name)
 289         self.reference = Signal()
 290         self.changed   = Signal()
 291         self.nocache   = Signal()
 292         self.priv      = Signal()
 293         self.rd_perm   = Signal()
 294         self.wr_perm   = Signal()
 295
 296
 297 def extract_perm_attr(pte):
 298     pa = PermAttr()
 299     return pa;
 300
 301
 302 # Type of operation on a "valid" input
 303 @unique
 304 class Op(Enum):
 305     OP_NONE       = 0
 306     OP_BAD        = 1 # NC cache hit, TLB miss, prot/RC failure
 307     OP_STCX_FAIL  = 2 # conditional store w/o reservation
 308     OP_LOAD_HIT   = 3 # Cache hit on load
 309     OP_LOAD_MISS  = 4 # Load missing cache
 310     OP_LOAD_NC    = 5 # Non-cachable load
 311     OP_STORE_HIT  = 6 # Store hitting cache
 312     OP_STORE_MISS = 7 # Store missing cache
 313
 314
 315 # Cache state machine
 316 @unique
 317 class State(Enum):
 318     IDLE             = 0 # Normal load hit processing
 319     RELOAD_WAIT_ACK  = 1 # Cache reload wait ack
 320     STORE_WAIT_ACK   = 2 # Store wait ack
 321     NC_LOAD_WAIT_ACK = 3 # Non-cachable load wait ack
 322
 323
 324 # Dcache operations:
 325 #
 326 # In order to make timing, we use the BRAMs with
 327 # an output buffer, which means that the BRAM
 328 # output is delayed by an extra cycle.
 329 #
 330 # Thus, the dcache has a 2-stage internal pipeline
 331 # for cache hits with no stalls.
 332 #
 333 # All other operations are handled via stalling
 334 # in the first stage.
 335 #
 336 # The second stage can thus complete a hit at the same
 337 # time as the first stage emits a stall for a complex op.
 338 #
 339 # Stage 0 register, basically contains just the latched request
 340
 341 class RegStage0(RecordObject):
 342     def __init__(self, name=None):
 343         super().__init__(name=name)
 344         self.req     = LoadStore1ToDCacheType(name="lsmem")
 345         self.tlbie   = Signal() # indicates a tlbie request (from MMU)
 346         self.doall   = Signal() # with tlbie, indicates flush whole TLB
 347         self.tlbld   = Signal() # indicates a TLB load request (from MMU)
 348         self.mmu_req = Signal() # indicates source of request
 349         self.d_valid = Signal() # indicates req.data is valid now
 350
 351
 352 class MemAccessRequest(RecordObject):
 353     def __init__(self, name=None):
 354         super().__init__(name=name)
 355         self.op        = Signal(Op)
 356         self.valid     = Signal()
 357         self.dcbz      = Signal()
 358         self.real_addr = Signal(REAL_ADDR_BITS)
 359         self.data      = Signal(64)
 360         self.byte_sel  = Signal(8)
 361         self.hit_way   = Signal(WAY_BITS)
 362         self.same_tag  = Signal()
 363         self.mmu_req   = Signal()
 364
 365
 366 # First stage register, contains state for stage 1 of load hits
 367 # and for the state machine used by all other operations
 368 class RegStage1(RecordObject):
 369     def __init__(self, name=None):
 370         super().__init__(name=name)
 371         # Info about the request
 372         self.full             = Signal() # have uncompleted request
 373         self.mmu_req          = Signal() # request is from MMU
 374         self.req              = MemAccessRequest(name="reqmem")
 375
 376         # Cache hit state
 377         self.hit_way          = Signal(WAY_BITS)
 378         self.hit_load_valid   = Signal()
 379         self.hit_index        = Signal(INDEX_BITS)
 380         self.cache_hit        = Signal()
 381
 382         # TLB hit state
 383         self.tlb_hit          = TLBHit("tlb_hit")
 384         self.tlb_hit_index    = Signal(TLB_SET_BITS)
 385
 386         # 2-stage data buffer for data forwarded from writes to reads
 387         self.forward_data1    = Signal(64)
 388         self.forward_data2    = Signal(64)
 389         self.forward_sel1     = Signal(8)
 390         self.forward_valid1   = Signal()
 391         self.forward_way1     = Signal(WAY_BITS)
 392         self.forward_row1     = Signal(ROW_BITS)
 393         self.use_forward1     = Signal()
 394         self.forward_sel      = Signal(8)
 395
 396         # Cache miss state (reload state machine)
 397         self.state            = Signal(State)
 398         self.dcbz             = Signal()
 399         self.write_bram       = Signal()
 400         self.write_tag        = Signal()
 401         self.slow_valid       = Signal()
 402         self.wb               = WBMasterOut("wb")
 403         self.reload_tag       = Signal(TAG_BITS)
 404         self.store_way        = Signal(WAY_BITS)
 405         self.store_row        = Signal(ROW_BITS)
 406         self.store_index      = Signal(INDEX_BITS)
 407         self.end_row_ix       = Signal(ROW_LINE_BITS)
 408         self.rows_valid       = RowPerLineValidArray()
 409         self.acks_pending     = Signal(3)
 410         self.inc_acks         = Signal()
 411         self.dec_acks         = Signal()
 412
 413         # Signals to complete (possibly with error)
 414         self.ls_valid         = Signal()
 415         self.ls_error         = Signal()
 416         self.mmu_done         = Signal()
 417         self.mmu_error        = Signal()
 418         self.cache_paradox    = Signal()
 419
 420         # Signal to complete a failed stcx.
 421         self.stcx_fail        = Signal()
 422
 423
 424 # Reservation information
 425 class Reservation(RecordObject):
 426     def __init__(self):
 427         super().__init__()
 428         self.valid = Signal()
 429         self.addr  = Signal(64-LINE_OFF_BITS)
 430
 431
 432 class DTLBUpdate(Elaboratable):
 433     def __init__(self):
 434         self.tlbie    = Signal()
 435         self.tlbwe    = Signal()
 436         self.doall    = Signal()
 437         self.updated  = Signal()
 438         self.v_updated  = Signal()
 439         self.tlb_hit     = TLBHit("tlb_hit")
 440         self.tlb_req_index = Signal(TLB_SET_BITS)
 441
 442         self.tlb_tag_way     = Signal(TLB_TAG_WAY_BITS)
 443         self.tlb_pte_way     = Signal(TLB_PTE_WAY_BITS)
 444         self.repl_way        = Signal(TLB_WAY_BITS)
 445         self.eatag           = Signal(TLB_EA_TAG_BITS)
 446         self.pte_data        = Signal(TLB_PTE_BITS)
 447
 448         self.dv = Signal(TLB_NUM_WAYS) # tlb_way_valids_t
 449
 450         self.tb_out = Signal(TLB_TAG_WAY_BITS) # tlb_way_tags_t
 451         self.db_out = Signal(TLB_NUM_WAYS)     # tlb_way_valids_t
 452         self.pb_out = Signal(TLB_PTE_WAY_BITS) # tlb_way_ptes_t
 453
 454     def elaborate(self, platform):
 455         m = Module()
 456         comb = m.d.comb
 457         sync = m.d.sync
 458
 459         tagset   = Signal(TLB_TAG_WAY_BITS)
 460         pteset   = Signal(TLB_PTE_WAY_BITS)
 461
 462         tb_out, pb_out, db_out = self.tb_out, self.pb_out, self.db_out
 463         comb += db_out.eq(self.dv)
 464
 465         with m.If(self.tlbie & self.doall):
 466             pass # clear all back in parent
 467         with m.Elif(self.tlbie):
 468             with m.If(self.tlb_hit.valid):
 469                 comb += db_out.bit_select(self.tlb_hit.way, 1).eq(0)
 470                 comb += self.v_updated.eq(1)
 471
 472         with m.Elif(self.tlbwe):
 473
 474             comb += tagset.eq(self.tlb_tag_way)
 475             comb += write_tlb_tag(self.repl_way, tagset, self.eatag)
 476             comb += tb_out.eq(tagset)
 477
 478             comb += pteset.eq(self.tlb_pte_way)
 479             comb += write_tlb_pte(self.repl_way, pteset, self.pte_data)
 480             comb += pb_out.eq(pteset)
 481
 482             comb += db_out.bit_select(self.repl_way, 1).eq(1)
 483
 484             comb += self.updated.eq(1)
 485             comb += self.v_updated.eq(1)
 486
 487         return m
 488
 489
 490 class DCachePendingHit(Elaboratable):
 491
 492     def __init__(self, tlb_way,
 493                       cache_i_validdx, cache_tag_set,
 494                     req_addr,
 495                     hit_set):
 496
 497         self.go          = Signal()
 498         self.virt_mode   = Signal()
 499         self.is_hit      = Signal()
 500         self.tlb_hit      = TLBHit("tlb_hit")
 501         self.hit_way     = Signal(WAY_BITS)
 502         self.rel_match   = Signal()
 503         self.req_index   = Signal(INDEX_BITS)
 504         self.reload_tag  = Signal(TAG_BITS)
 505
 506         self.tlb_way = tlb_way
 507         self.cache_i_validdx = cache_i_validdx
 508         self.cache_tag_set = cache_tag_set
 509         self.req_addr = req_addr
 510         self.hit_set = hit_set
 511
 512     def elaborate(self, platform):
 513         m = Module()
 514         comb = m.d.comb
 515         sync = m.d.sync
 516
 517         go = self.go
 518         virt_mode = self.virt_mode
 519         is_hit = self.is_hit
 520         tlb_way = self.tlb_way
 521         cache_i_validdx = self.cache_i_validdx
 522         cache_tag_set = self.cache_tag_set
 523         req_addr = self.req_addr
 524         tlb_hit = self.tlb_hit
 525         hit_set = self.hit_set
 526         hit_way = self.hit_way
 527         rel_match = self.rel_match
 528         req_index = self.req_index
 529         reload_tag = self.reload_tag
 530
 531         rel_matches = Array(Signal(name="rel_matches_%d" % i) \
 532                                     for i in range(TLB_NUM_WAYS))
 533         hit_way_set = HitWaySet()
 534
 535         # Test if pending request is a hit on any way
 536         # In order to make timing in virtual mode,
 537         # when we are using the TLB, we compare each
 538         # way with each of the real addresses from each way of
 539         # the TLB, and then decide later which match to use.
 540
 541         with m.If(virt_mode):
 542             for j in range(TLB_NUM_WAYS): # tlb_num_way_t
 543                 s_tag       = Signal(TAG_BITS, name="s_tag%d" % j)
 544                 s_hit       = Signal()
 545                 s_pte       = Signal(TLB_PTE_BITS)
 546                 s_ra        = Signal(REAL_ADDR_BITS)
 547                 comb += s_pte.eq(read_tlb_pte(j, tlb_way.pte))
 548                 comb += s_ra.eq(Cat(req_addr[0:TLB_LG_PGSZ],
 549                                     s_pte[TLB_LG_PGSZ:REAL_ADDR_BITS]))
 550                 comb += s_tag.eq(get_tag(s_ra))
 551
 552                 for i in range(NUM_WAYS): # way_t
 553                     is_tag_hit = Signal(name="is_tag_hit_%d_%d" % (j, i))
 554                     comb += is_tag_hit.eq(go & cache_i_validdx[i] &
 555                                   (read_tag(i, cache_tag_set) == s_tag)
 556                                   & (tlb_way.valid[j]))
 557                     with m.If(is_tag_hit):
 558                         comb += hit_way_set[j].eq(i)
 559                         comb += s_hit.eq(1)
 560                 comb += hit_set[j].eq(s_hit)
 561                 with m.If(s_tag == reload_tag):
 562                     comb += rel_matches[j].eq(1)
 563             with m.If(tlb_hit.way):
 564                 comb += is_hit.eq(hit_set[tlb_hit.way])
 565                 comb += hit_way.eq(hit_way_set[tlb_hit.way])
 566                 comb += rel_match.eq(rel_matches[tlb_hit.way])
 567         with m.Else():
 568             s_tag       = Signal(TAG_BITS)
 569             comb += s_tag.eq(get_tag(req_addr))
 570             for i in range(NUM_WAYS): # way_t
 571                 is_tag_hit = Signal(name="is_tag_hit_%d" % i)
 572                 comb += is_tag_hit.eq(go & cache_i_validdx[i] &
 573                           (read_tag(i, cache_tag_set) == s_tag))
 574                 with m.If(is_tag_hit):
 575                     comb += hit_way.eq(i)
 576                     comb += is_hit.eq(1)
 577             with m.If(s_tag == reload_tag):
 578                 comb += rel_match.eq(1)
 579
 580         return m
 581
 582
 583 class DCache(Elaboratable):
 584     """Set associative dcache write-through
 585
 586     TODO (in no specific order):
 587     * See list in icache.vhdl
 588     * Complete load misses on the cycle when WB data comes instead of
 589       at the end of line (this requires dealing with requests coming in
 590       while not idle...)
 591     """
 592     def __init__(self):
 593         self.d_in      = LoadStore1ToDCacheType("d_in")
 594         self.d_out     = DCacheToLoadStore1Type("d_out")
 595
 596         self.m_in      = MMUToDCacheType("m_in")
 597         self.m_out     = DCacheToMMUType("m_out")
 598
 599         self.stall_out = Signal()
 600
 601         # standard naming (wired to non-standard for compatibility)
 602         self.bus = Interface(addr_width=32,
 603                             data_width=64,
 604                             granularity=8,
 605                             features={'stall'},
 606                             alignment=0,
 607                             name="dcache")
 608
 609         self.log_out   = Signal(20)
 610
 611     def stage_0(self, m, r0, r1, r0_full):
 612         """Latch the request in r0.req as long as we're not stalling
 613         """
 614         comb = m.d.comb
 615         sync = m.d.sync
 616         d_in, d_out, m_in = self.d_in, self.d_out, self.m_in
 617
 618         r = RegStage0("stage0")
 619
 620         # TODO, this goes in unit tests and formal proofs
 621         with m.If(d_in.valid & m_in.valid):
 622             sync += Display("request collision loadstore vs MMU")
 623
 624         with m.If(m_in.valid):
 625             comb += r.req.valid.eq(1)
 626             comb += r.req.load.eq(~(m_in.tlbie | m_in.tlbld))# no invalidate
 627             comb += r.req.dcbz.eq(0)
 628             comb += r.req.nc.eq(0)
 629             comb += r.req.reserve.eq(0)
 630             comb += r.req.virt_mode.eq(0)
 631             comb += r.req.priv_mode.eq(1)
 632             comb += r.req.addr.eq(m_in.addr)
 633             comb += r.req.data.eq(m_in.pte)
 634             comb += r.req.byte_sel.eq(~0) # Const -1 sets all to 0b111....
 635             comb += r.tlbie.eq(m_in.tlbie)
 636             comb += r.doall.eq(m_in.doall)
 637             comb += r.tlbld.eq(m_in.tlbld)
 638             comb += r.mmu_req.eq(1)
 639             m.d.sync += Display("    DCACHE req mmu addr %x pte %x ld %d",
 640                                  m_in.addr, m_in.pte, r.req.load)
 641
 642         with m.Else():
 643             comb += r.req.eq(d_in)
 644             comb += r.req.data.eq(0)
 645             comb += r.tlbie.eq(0)
 646             comb += r.doall.eq(0)
 647             comb += r.tlbld.eq(0)
 648             comb += r.mmu_req.eq(0)
 649         with m.If((~r1.full & ~d_in.hold) | ~r0_full):
 650             sync += r0.eq(r)
 651             sync += r0_full.eq(r.req.valid)
 652             # Sample data the cycle after a request comes in from loadstore1.
 653             # If another request has come in already then the data will get
 654             # put directly into req.data below.
 655             with m.If(r0.req.valid & ~r.req.valid & ~r0.d_valid &
 656                      ~r0.mmu_req):
 657                 sync += r0.req.data.eq(d_in.data)
 658                 sync += r0.d_valid.eq(1)
 659         with m.If(d_in.valid):
 660             m.d.sync += Display("    DCACHE req cache "
 661                                 "virt %d addr %x data %x ld %d",
 662                                  r.req.virt_mode, r.req.addr,
 663                                  r.req.data, r.req.load)
 664
 665     def tlb_read(self, m, r0_stall, tlb_way, dtlb):
 666         """TLB
 667         Operates in the second cycle on the request latched in r0.req.
 668         TLB updates write the entry at the end of the second cycle.
 669         """
 670         comb = m.d.comb
 671         sync = m.d.sync
 672         m_in, d_in = self.m_in, self.d_in
 673
 674         index    = Signal(TLB_SET_BITS)
 675         addrbits = Signal(TLB_SET_BITS)
 676
 677         amin = TLB_LG_PGSZ
 678         amax = TLB_LG_PGSZ + TLB_SET_BITS
 679
 680         with m.If(m_in.valid):
 681             comb += addrbits.eq(m_in.addr[amin : amax])
 682         with m.Else():
 683             comb += addrbits.eq(d_in.addr[amin : amax])
 684         comb += index.eq(addrbits)
 685
 686         # If we have any op and the previous op isn't finished,
 687         # then keep the same output for next cycle.
 688         with m.If(~r0_stall):
 689             sync += tlb_way.eq(dtlb[index])
 690
 691     def maybe_tlb_plrus(self, m, r1, tlb_plru_victim):
 692         """Generate TLB PLRUs
 693         """
 694         comb = m.d.comb
 695         sync = m.d.sync
 696
 697         if TLB_NUM_WAYS == 0:
 698             return
 699
 700         # Binary-to-Unary one-hot, enabled by tlb_hit valid
 701         m.submodules.tlb_hit_e = te = Decoder(TLB_SET_SIZE)
 702         comb += te.n.eq(~r1.tlb_hit.valid)
 703         comb += te.i.eq(r1.tlb_hit_index)
 704
 705         for i in range(TLB_SET_SIZE):
 706             # TLB PLRU interface
 707             tlb_plru        = PLRU(TLB_WAY_BITS)
 708             setattr(m.submodules, "maybe_plru_%d" % i, tlb_plru)
 709             tlb_plru_acc_en = Signal()
 710
 711             comb += tlb_plru_acc_en.eq(te.o[i])
 712             comb += tlb_plru.acc_en.eq(tlb_plru_acc_en)
 713             comb += tlb_plru.acc_i.eq(r1.tlb_hit.way)
 714             comb += tlb_plru_victim[i].eq(tlb_plru.lru_o)
 715
 716     def tlb_search(self, m, tlb_req_index, r0, r0_valid,
 717                    tlb_way,
 718                    pte, tlb_hit, valid_ra, perm_attr, ra):
 719
 720         comb = m.d.comb
 721
 722         hitway = Signal(TLB_WAY_BITS)
 723         hit    = Signal()
 724         eatag  = Signal(TLB_EA_TAG_BITS)
 725
 726         TLB_LG_END = TLB_LG_PGSZ + TLB_SET_BITS
 727         comb += tlb_req_index.eq(r0.req.addr[TLB_LG_PGSZ : TLB_LG_END])
 728         comb += eatag.eq(r0.req.addr[TLB_LG_END : 64 ])
 729
 730         for i in range(TLB_NUM_WAYS):
 731             is_tag_hit = Signal(name="is_tag_hit%d" % i)
 732             tlb_tag = Signal(TLB_EA_TAG_BITS, name="tlb_tag%d" % i)
 733             comb += tlb_tag.eq(read_tlb_tag(i, tlb_way.tag))
 734             comb += is_tag_hit.eq((tlb_way.valid[i]) & (tlb_tag == eatag))
 735             with m.If(is_tag_hit):
 736                 comb += hitway.eq(i)
 737                 comb += hit.eq(1)
 738
 739         comb += tlb_hit.valid.eq(hit & r0_valid)
 740         comb += tlb_hit.way.eq(hitway)
 741
 742         with m.If(tlb_hit.valid):
 743             comb += pte.eq(read_tlb_pte(hitway, tlb_way.pte))
 744         comb += valid_ra.eq(tlb_hit.valid | ~r0.req.virt_mode)
 745
 746         with m.If(r0.req.virt_mode):
 747             comb += ra.eq(Cat(Const(0, ROW_OFF_BITS),
 748                               r0.req.addr[ROW_OFF_BITS:TLB_LG_PGSZ],
 749                               pte[TLB_LG_PGSZ:REAL_ADDR_BITS]))
 750             comb += perm_attr.reference.eq(pte[8])
 751             comb += perm_attr.changed.eq(pte[7])
 752             comb += perm_attr.nocache.eq(pte[5])
 753             comb += perm_attr.priv.eq(pte[3])
 754             comb += perm_attr.rd_perm.eq(pte[2])
 755             comb += perm_attr.wr_perm.eq(pte[1])
 756         with m.Else():
 757             comb += ra.eq(Cat(Const(0, ROW_OFF_BITS),
 758                               r0.req.addr[ROW_OFF_BITS:REAL_ADDR_BITS]))
 759             comb += perm_attr.reference.eq(1)
 760             comb += perm_attr.changed.eq(1)
 761             comb += perm_attr.nocache.eq(0)
 762             comb += perm_attr.priv.eq(1)
 763             comb += perm_attr.rd_perm.eq(1)
 764             comb += perm_attr.wr_perm.eq(1)
 765
 766         with m.If(valid_ra):
 767             m.d.sync += Display("DCACHE virt mode %d hit %d ra %x pte %x",
 768                                 r0.req.virt_mode, tlb_hit.valid, ra, pte)
 769             m.d.sync += Display("       perm ref=%d", perm_attr.reference)
 770             m.d.sync += Display("       perm chg=%d", perm_attr.changed)
 771             m.d.sync += Display("       perm noc=%d", perm_attr.nocache)
 772             m.d.sync += Display("       perm prv=%d", perm_attr.priv)
 773             m.d.sync += Display("       perm rdp=%d", perm_attr.rd_perm)
 774             m.d.sync += Display("       perm wrp=%d", perm_attr.wr_perm)
 775
 776     def tlb_update(self, m, r0_valid, r0, dtlb, tlb_req_index,
 777                     tlb_hit, tlb_plru_victim, tlb_way):
 778
 779         comb = m.d.comb
 780         sync = m.d.sync
 781
 782         tlbie    = Signal()
 783         tlbwe    = Signal()
 784
 785         comb += tlbie.eq(r0_valid & r0.tlbie)
 786         comb += tlbwe.eq(r0_valid & r0.tlbld)
 787
 788         m.submodules.tlb_update = d = DTLBUpdate()
 789         with m.If(tlbie & r0.doall):
 790             # clear all valid bits at once
 791             for i in range(TLB_SET_SIZE):
 792                 sync += dtlb[i].valid.eq(0)
 793         with m.If(d.updated):
 794             sync += dtlb[tlb_req_index].tag.eq(d.tb_out)
 795             sync += dtlb[tlb_req_index].pte.eq(d.pb_out)
 796         with m.If(d.v_updated):
 797             sync += dtlb[tlb_req_index].valid.eq(d.db_out)
 798
 799         comb += d.dv.eq(dtlb[tlb_req_index].valid)
 800
 801         comb += d.tlbie.eq(tlbie)
 802         comb += d.tlbwe.eq(tlbwe)
 803         comb += d.doall.eq(r0.doall)
 804         comb += d.tlb_hit.eq(tlb_hit)
 805         comb += d.tlb_tag_way.eq(tlb_way.tag)
 806         comb += d.tlb_pte_way.eq(tlb_way.pte)
 807         comb += d.tlb_req_index.eq(tlb_req_index)
 808
 809         with m.If(tlb_hit.valid):
 810             comb += d.repl_way.eq(tlb_hit.way)
 811         with m.Else():
 812             comb += d.repl_way.eq(tlb_plru_victim[tlb_req_index])
 813         comb += d.eatag.eq(r0.req.addr[TLB_LG_PGSZ + TLB_SET_BITS:64])
 814         comb += d.pte_data.eq(r0.req.data)
 815
 816     def maybe_plrus(self, m, r1, plru_victim):
 817         """Generate PLRUs
 818         """
 819         comb = m.d.comb
 820         sync = m.d.sync
 821
 822         if TLB_NUM_WAYS == 0:
 823             return
 824
 825         # XXX TODO: use a Binary-to-Unary one-hot here,
 826         # enabled by cache_hit
 827         m.submodules.hit_e = he = Decoder(NUM_LINES)
 828         comb += he.n.eq(~r1.cache_hit)
 829         comb += he.i.eq(r1.hit_index)
 830
 831         for i in range(NUM_LINES):
 832             # PLRU interface
 833             plru        = PLRU(WAY_BITS)
 834             m.submodules["plru%d" % i] = plru
 835             plru_acc_en = Signal()
 836
 837             comb += plru_acc_en.eq(he.o[i])
 838             comb += plru.acc_en.eq(plru_acc_en)
 839             comb += plru.acc_i.eq(r1.hit_way)
 840             comb += plru_victim[i].eq(plru.lru_o)
 841
 842     def cache_tag_read(self, m, r0_stall, req_index, cache_tag_set, cache_tags):
 843         """Cache tag RAM read port
 844         """
 845         comb = m.d.comb
 846         sync = m.d.sync
 847         m_in, d_in = self.m_in, self.d_in
 848
 849         index = Signal(INDEX_BITS)
 850
 851         with m.If(r0_stall):
 852             comb += index.eq(req_index)
 853         with m.Elif(m_in.valid):
 854             comb += index.eq(get_index(m_in.addr))
 855         with m.Else():
 856             comb += index.eq(get_index(d_in.addr))
 857         sync += cache_tag_set.eq(cache_tags[index].tag)
 858
 859     def dcache_request(self, m, r0, ra, req_index, req_row, req_tag,
 860                        r0_valid, r1, cache_tags, replace_way,
 861                        use_forward1_next, use_forward2_next,
 862                        req_hit_way, plru_victim, rc_ok, perm_attr,
 863                        valid_ra, perm_ok, access_ok, req_op, req_go,
 864                        tlb_hit, tlb_way, cache_tag_set,
 865                        cancel_store, req_same_tag, r0_stall, early_req_row):
 866         """Cache request parsing and hit detection
 867         """
 868
 869         comb = m.d.comb
 870         m_in, d_in = self.m_in, self.d_in
 871
 872         is_hit      = Signal()
 873         hit_way     = Signal(WAY_BITS)
 874         op          = Signal(Op)
 875         opsel       = Signal(3)
 876         go          = Signal()
 877         nc          = Signal()
 878         hit_set     = Array(Signal(name="hit_set_%d" % i) \
 879                                   for i in range(TLB_NUM_WAYS))
 880         cache_i_validdx = Signal(NUM_WAYS)
 881
 882         # Extract line, row and tag from request
 883         comb += req_index.eq(get_index(r0.req.addr))
 884         comb += req_row.eq(get_row(r0.req.addr))
 885         comb += req_tag.eq(get_tag(ra))
 886
 887         if False: # display on comb is a bit... busy.
 888             comb += Display("dcache_req addr:%x ra: %x idx: %x tag: %x row: %x",
 889                     r0.req.addr, ra, req_index, req_tag, req_row)
 890
 891         comb += go.eq(r0_valid & ~(r0.tlbie | r0.tlbld) & ~r1.ls_error)
 892         comb += cache_i_validdx.eq(cache_tags[req_index].valid)
 893
 894         m.submodules.dcache_pend = dc = DCachePendingHit(tlb_way,
 895                                             cache_i_validdx, cache_tag_set,
 896                                             r0.req.addr,
 897                                             hit_set)
 898         comb += dc.tlb_hit.eq(tlb_hit)
 899         comb += dc.reload_tag.eq(r1.reload_tag)
 900         comb += dc.virt_mode.eq(r0.req.virt_mode)
 901         comb += dc.go.eq(go)
 902         comb += dc.req_index.eq(req_index)
 903
 904         comb += is_hit.eq(dc.is_hit)
 905         comb += hit_way.eq(dc.hit_way)
 906         comb += req_same_tag.eq(dc.rel_match)
 907
 908         # See if the request matches the line currently being reloaded
 909         with m.If((r1.state == State.RELOAD_WAIT_ACK) &
 910                   (req_index == r1.store_index) & req_same_tag):
 911             # For a store, consider this a hit even if the row isn't
 912             # valid since it will be by the time we perform the store.
 913             # For a load, check the appropriate row valid bit.
 914             rrow = Signal(ROW_LINE_BITS)
 915             comb += rrow.eq(req_row)
 916             valid = r1.rows_valid[rrow]
 917             comb += is_hit.eq((~r0.req.load) | valid)
 918             comb += hit_way.eq(replace_way)
 919
 920         # Whether to use forwarded data for a load or not
 921         with m.If((get_row(r1.req.real_addr) == req_row) &
 922                   (r1.req.hit_way == hit_way)):
 923             # Only need to consider r1.write_bram here, since if we
 924             # are writing refill data here, then we don't have a
 925             # cache hit this cycle on the line being refilled.
 926             # (There is the possibility that the load following the
 927             # load miss that started the refill could be to the old
 928             # contents of the victim line, since it is a couple of
 929             # cycles after the refill starts before we see the updated
 930             # cache tag. In that case we don't use the bypass.)
 931             comb += use_forward1_next.eq(r1.write_bram)
 932         with m.If((r1.forward_row1 == req_row) & (r1.forward_way1 == hit_way)):
 933             comb += use_forward2_next.eq(r1.forward_valid1)
 934
 935         # The way that matched on a hit
 936         comb += req_hit_way.eq(hit_way)
 937
 938         # The way to replace on a miss
 939         with m.If(r1.write_tag):
 940             comb += replace_way.eq(plru_victim[r1.store_index])
 941         with m.Else():
 942             comb += replace_way.eq(r1.store_way)
 943
 944         # work out whether we have permission for this access
 945         # NB we don't yet implement AMR, thus no KUAP
 946         comb += rc_ok.eq(perm_attr.reference
 947                          & (r0.req.load | perm_attr.changed))
 948         comb += perm_ok.eq((r0.req.priv_mode | (~perm_attr.priv)) &
 949                            (perm_attr.wr_perm |
 950                               (r0.req.load & perm_attr.rd_perm)))
 951         comb += access_ok.eq(valid_ra & perm_ok & rc_ok)
 952
 953         # Combine the request and cache hit status to decide what
 954         # operation needs to be done
 955         comb += nc.eq(r0.req.nc | perm_attr.nocache)
 956         comb += op.eq(Op.OP_NONE)
 957         with m.If(go):
 958             with m.If(~access_ok):
 959                 m.d.sync += Display("DCACHE access fail valid_ra=%d p=%d rc=%d",
 960                                  valid_ra, perm_ok, rc_ok)
 961                 comb += op.eq(Op.OP_BAD)
 962             with m.Elif(cancel_store):
 963                 m.d.sync += Display("DCACHE cancel store")
 964                 comb += op.eq(Op.OP_STCX_FAIL)
 965             with m.Else():
 966                 m.d.sync += Display("DCACHE valid_ra=%d nc=%d ld=%d",
 967                                  valid_ra, nc, r0.req.load)
 968                 comb += opsel.eq(Cat(is_hit, nc, r0.req.load))
 969                 with m.Switch(opsel):
 970                     with m.Case(0b101): comb += op.eq(Op.OP_LOAD_HIT)
 971                     with m.Case(0b100): comb += op.eq(Op.OP_LOAD_MISS)
 972                     with m.Case(0b110): comb += op.eq(Op.OP_LOAD_NC)
 973                     with m.Case(0b001): comb += op.eq(Op.OP_STORE_HIT)
 974                     with m.Case(0b000): comb += op.eq(Op.OP_STORE_MISS)
 975                     with m.Case(0b010): comb += op.eq(Op.OP_STORE_MISS)
 976                     with m.Case(0b011): comb += op.eq(Op.OP_BAD)
 977                     with m.Case(0b111): comb += op.eq(Op.OP_BAD)
 978         comb += req_op.eq(op)
 979         comb += req_go.eq(go)
 980
 981         # Version of the row number that is valid one cycle earlier
 982         # in the cases where we need to read the cache data BRAM.
 983         # If we're stalling then we need to keep reading the last
 984         # row requested.
 985         with m.If(~r0_stall):
 986             with m.If(m_in.valid):
 987                 comb += early_req_row.eq(get_row(m_in.addr))
 988             with m.Else():
 989                 comb += early_req_row.eq(get_row(d_in.addr))
 990         with m.Else():
 991             comb += early_req_row.eq(req_row)
 992
 993     def reservation_comb(self, m, cancel_store, set_rsrv, clear_rsrv,
 994                          r0_valid, r0, reservation):
 995         """Handle load-with-reservation and store-conditional instructions
 996         """
 997         comb = m.d.comb
 998
 999         with m.If(r0_valid & r0.req.reserve):
1000             # XXX generate alignment interrupt if address
1001             # is not aligned XXX or if r0.req.nc = '1'
1002             with m.If(r0.req.load):
1003                 comb += set_rsrv.eq(r0.req.atomic_last) # load with reservation
1004             with m.Else():
1005                 comb += clear_rsrv.eq(r0.req.atomic_last) # store conditional
1006                 with m.If((~reservation.valid) |
1007                          (r0.req.addr[LINE_OFF_BITS:64] != reservation.addr)):
1008                     comb += cancel_store.eq(1)
1009
1010     def reservation_reg(self, m, r0_valid, access_ok, set_rsrv, clear_rsrv,
1011                         reservation, r0):
1012         comb = m.d.comb
1013         sync = m.d.sync
1014
1015         with m.If(r0_valid & access_ok):
1016             with m.If(clear_rsrv):
1017                 sync += reservation.valid.eq(0)
1018             with m.Elif(set_rsrv):
1019                 sync += reservation.valid.eq(1)
1020                 sync += reservation.addr.eq(r0.req.addr[LINE_OFF_BITS:64])
1021
1022     def writeback_control(self, m, r1, cache_out_row):
1023         """Return data for loads & completion control logic
1024         """
1025         comb = m.d.comb
1026         sync = m.d.sync
1027         d_out, m_out = self.d_out, self.m_out
1028
1029         data_out = Signal(64)
1030         data_fwd = Signal(64)
1031
1032         # Use the bypass if are reading the row that was
1033         # written 1 or 2 cycles ago, including for the
1034         # slow_valid = 1 case (i.e. completing a load
1035         # miss or a non-cacheable load).
1036         with m.If(r1.use_forward1):
1037             comb += data_fwd.eq(r1.forward_data1)
1038         with m.Else():
1039             comb += data_fwd.eq(r1.forward_data2)
1040
1041         comb += data_out.eq(cache_out_row)
1042
1043         for i in range(8):
1044             with m.If(r1.forward_sel[i]):
1045                 dsel = data_fwd.word_select(i, 8)
1046                 comb += data_out.word_select(i, 8).eq(dsel)
1047
1048         # DCache output to LoadStore
1049         comb += d_out.valid.eq(r1.ls_valid)
1050         comb += d_out.data.eq(data_out)
1051         comb += d_out.store_done.eq(~r1.stcx_fail)
1052         comb += d_out.error.eq(r1.ls_error)
1053         comb += d_out.cache_paradox.eq(r1.cache_paradox)
1054
1055         # Outputs to MMU
1056         comb += m_out.done.eq(r1.mmu_done)
1057         comb += m_out.err.eq(r1.mmu_error)
1058         comb += m_out.data.eq(data_out)
1059
1060         # We have a valid load or store hit or we just completed
1061         # a slow op such as a load miss, a NC load or a store
1062         #
1063         # Note: the load hit is delayed by one cycle. However it
1064         # can still not collide with r.slow_valid (well unless I
1065         # miscalculated) because slow_valid can only be set on a
1066         # subsequent request and not on its first cycle (the state
1067         # machine must have advanced), which makes slow_valid
1068         # at least 2 cycles from the previous hit_load_valid.
1069
1070         # Sanity: Only one of these must be set in any given cycle
1071
1072         if False: # TODO: need Display to get this to work
1073             assert (r1.slow_valid & r1.stcx_fail) != 1, \
1074             "unexpected slow_valid collision with stcx_fail"
1075
1076             assert ((r1.slow_valid | r1.stcx_fail) | r1.hit_load_valid) != 1, \
1077              "unexpected hit_load_delayed collision with slow_valid"
1078
1079         with m.If(~r1.mmu_req):
1080             # Request came from loadstore1...
1081             # Load hit case is the standard path
1082             with m.If(r1.hit_load_valid):
1083                 sync += Display("completing load hit data=%x", data_out)
1084
1085             # error cases complete without stalling
1086             with m.If(r1.ls_error):
1087                 with m.If(r1.dcbz):
1088                     sync += Display("completing dcbz with error")
1089                 with m.Else():
1090                     sync += Display("completing ld/st with error")
1091
1092             # Slow ops (load miss, NC, stores)
1093             with m.If(r1.slow_valid):
1094                 sync += Display("completing store or load miss adr=%x data=%x",
1095                                 r1.req.real_addr, data_out)
1096
1097         with m.Else():
1098             # Request came from MMU
1099             with m.If(r1.hit_load_valid):
1100                 sync += Display("completing load hit to MMU, data=%x",
1101                                 m_out.data)
1102             # error cases complete without stalling
1103             with m.If(r1.mmu_error):
1104                 sync += Display("combpleting MMU ld with error")
1105
1106             # Slow ops (i.e. load miss)
1107             with m.If(r1.slow_valid):
1108                 sync += Display("completing MMU load miss, adr=%x data=%x",
1109                                 r1.req.real_addr, m_out.data)
1110
1111     def rams(self, m, r1, early_req_row, cache_out_row, replace_way):
1112         """rams
1113         Generate a cache RAM for each way. This handles the normal
1114         reads, writes from reloads and the special store-hit update
1115         path as well.
1116
1117         Note: the BRAMs have an extra read buffer, meaning the output
1118         is pipelined an extra cycle. This differs from the
1119         icache. The writeback logic needs to take that into
1120         account by using 1-cycle delayed signals for load hits.
1121         """
1122         comb = m.d.comb
1123         bus = self.bus
1124
1125         # a Binary-to-Unary one-hots here.  replace-way one-hot is gated
1126         # (enabled) by bus.ack, not-write-bram, and state RELOAD_WAIT_ACK
1127         m.submodules.rams_replace_way_e = rwe = Decoder(NUM_WAYS)
1128         comb += rwe.n.eq(~((r1.state == State.RELOAD_WAIT_ACK) & bus.ack &
1129                    ~r1.write_bram))
1130         comb += rwe.i.eq(replace_way)
1131
1132         m.submodules.rams_hit_way_e = hwe = Decoder(NUM_WAYS)
1133         comb += hwe.i.eq(r1.hit_way)
1134
1135         # this one is gated with write_bram, and replace_way_e can never be
1136         # set at the same time.  that means that do_write can OR the outputs
1137         m.submodules.rams_hit_req_way_e = hre = Decoder(NUM_WAYS)
1138         comb += hre.n.eq(~r1.write_bram) # Decoder.n is inverted
1139         comb += hre.i.eq(r1.req.hit_way)
1140
1141         do_read  = Signal()
1142         comb += do_read.eq(1)
1143
1144         for i in range(NUM_WAYS):
1145             rd_addr  = Signal(ROW_BITS, name="rd_addr_%d" % i)
1146             do_write = Signal(name="do_wr%d" % i)
1147             wr_addr  = Signal(ROW_BITS, name="wr_addr_%d" % i)
1148             wr_data  = Signal(WB_DATA_BITS, name="din_%d" % i)
1149             wr_sel   = Signal(ROW_SIZE)
1150             wr_sel_m = Signal(ROW_SIZE)
1151             _d_out   = Signal(WB_DATA_BITS, name="dout_%d" % i) # cache_row_t
1152
1153             way = CacheRam(ROW_BITS, WB_DATA_BITS, ADD_BUF=True, ram_num=i)
1154             setattr(m.submodules, "cacheram_%d" % i, way)
1155
1156             comb += way.rd_en.eq(do_read)
1157             comb += way.rd_addr.eq(rd_addr)
1158             comb += _d_out.eq(way.rd_data_o)
1159             comb += way.wr_sel.eq(wr_sel_m)
1160             comb += way.wr_addr.eq(wr_addr)
1161             comb += way.wr_data.eq(wr_data)
1162
1163             # Cache hit reads
1164             comb += rd_addr.eq(early_req_row)
1165             with m.If(hwe.o[i]):
1166                 comb += cache_out_row.eq(_d_out)
1167
1168             # Write mux:
1169             #
1170             # Defaults to wishbone read responses (cache refill)
1171             #
1172             # For timing, the mux on wr_data/sel/addr is not
1173             # dependent on anything other than the current state.
1174
1175             with m.If(r1.write_bram):
1176                 # Write store data to BRAM.  This happens one
1177                 # cycle after the store is in r0.
1178                 comb += wr_data.eq(r1.req.data)
1179                 comb += wr_sel.eq(r1.req.byte_sel)
1180                 comb += wr_addr.eq(get_row(r1.req.real_addr))
1181
1182             with m.Else():
1183                 # Otherwise, we might be doing a reload or a DCBZ
1184                 with m.If(r1.dcbz):
1185                     comb += wr_data.eq(0)
1186                 with m.Else():
1187                     comb += wr_data.eq(bus.dat_r)
1188                 comb += wr_addr.eq(r1.store_row)
1189                 comb += wr_sel.eq(~0) # all 1s
1190
1191             # these are mutually-exclusive via their Decoder-enablers
1192             # (note: Decoder-enable is inverted)
1193             comb += do_write.eq(hre.o[i] | rwe.o[i])
1194
1195             # Mask write selects with do_write since BRAM
1196             # doesn't have a global write-enable
1197             with m.If(do_write):
1198                 comb += wr_sel_m.eq(wr_sel)
1199
1200     # Cache hit synchronous machine for the easy case.
1201     # This handles load hits.
1202     # It also handles error cases (TLB miss, cache paradox)
1203     def dcache_fast_hit(self, m, req_op, r0_valid, r0, r1,
1204                         req_hit_way, req_index, req_tag, access_ok,
1205                         tlb_hit, tlb_req_index):
1206         comb = m.d.comb
1207         sync = m.d.sync
1208
1209         with m.If(req_op != Op.OP_NONE):
1210             sync += Display("op:%d addr:%x nc: %d idx: %x tag: %x way: %x",
1211                     req_op, r0.req.addr, r0.req.nc,
1212                     req_index, req_tag, req_hit_way)
1213
1214         with m.If(r0_valid):
1215             sync += r1.mmu_req.eq(r0.mmu_req)
1216
1217         # Fast path for load/store hits.
1218         # Set signals for the writeback controls.
1219         sync += r1.hit_way.eq(req_hit_way)
1220         sync += r1.hit_index.eq(req_index)
1221
1222         sync += r1.hit_load_valid.eq(req_op == Op.OP_LOAD_HIT)
1223         sync += r1.cache_hit.eq((req_op == Op.OP_LOAD_HIT) |
1224                                 (req_op == Op.OP_STORE_HIT))
1225
1226         with m.If(req_op == Op.OP_BAD):
1227             sync += Display("Signalling ld/st error "
1228                             "ls_error=%i mmu_error=%i cache_paradox=%i",
1229                             ~r0.mmu_req,r0.mmu_req,access_ok)
1230             sync += r1.ls_error.eq(~r0.mmu_req)
1231             sync += r1.mmu_error.eq(r0.mmu_req)
1232             sync += r1.cache_paradox.eq(access_ok)
1233         with m.Else():
1234             sync += r1.ls_error.eq(0)
1235             sync += r1.mmu_error.eq(0)
1236             sync += r1.cache_paradox.eq(0)
1237
1238         sync += r1.stcx_fail.eq(req_op == Op.OP_STCX_FAIL)
1239
1240         # Record TLB hit information for updating TLB PLRU
1241         sync += r1.tlb_hit.eq(tlb_hit)
1242         sync += r1.tlb_hit_index.eq(tlb_req_index)
1243
1244     # Memory accesses are handled by this state machine:
1245     #
1246     #   * Cache load miss/reload (in conjunction with "rams")
1247     #   * Load hits for non-cachable forms
1248     #   * Stores (the collision case is handled in "rams")
1249     #
1250     # All wishbone requests generation is done here.
1251     # This machine operates at stage 1.
1252     def dcache_slow(self, m, r1, use_forward1_next, use_forward2_next,
1253                     r0, replace_way,
1254                     req_hit_way, req_same_tag,
1255                     r0_valid, req_op, cache_tags, req_go, ra):
1256
1257         comb = m.d.comb
1258         sync = m.d.sync
1259         bus = self.bus
1260         d_in = self.d_in
1261
1262         req         = MemAccessRequest("mreq_ds")
1263
1264         req_row = Signal(ROW_BITS)
1265         req_idx = Signal(INDEX_BITS)
1266         req_tag = Signal(TAG_BITS)
1267         comb += req_idx.eq(get_index(req.real_addr))
1268         comb += req_row.eq(get_row(req.real_addr))
1269         comb += req_tag.eq(get_tag(req.real_addr))
1270
1271         sync += r1.use_forward1.eq(use_forward1_next)
1272         sync += r1.forward_sel.eq(0)
1273
1274         with m.If(use_forward1_next):
1275             sync += r1.forward_sel.eq(r1.req.byte_sel)
1276         with m.Elif(use_forward2_next):
1277             sync += r1.forward_sel.eq(r1.forward_sel1)
1278
1279         sync += r1.forward_data2.eq(r1.forward_data1)
1280         with m.If(r1.write_bram):
1281             sync += r1.forward_data1.eq(r1.req.data)
1282             sync += r1.forward_sel1.eq(r1.req.byte_sel)
1283             sync += r1.forward_way1.eq(r1.req.hit_way)
1284             sync += r1.forward_row1.eq(get_row(r1.req.real_addr))
1285             sync += r1.forward_valid1.eq(1)
1286         with m.Else():
1287             with m.If(r1.dcbz):
1288                 sync += r1.forward_data1.eq(0)
1289             with m.Else():
1290                 sync += r1.forward_data1.eq(bus.dat_r)
1291             sync += r1.forward_sel1.eq(~0) # all 1s
1292             sync += r1.forward_way1.eq(replace_way)
1293             sync += r1.forward_row1.eq(r1.store_row)
1294             sync += r1.forward_valid1.eq(0)
1295
1296         # One cycle pulses reset
1297         sync += r1.slow_valid.eq(0)
1298         sync += r1.write_bram.eq(0)
1299         sync += r1.inc_acks.eq(0)
1300         sync += r1.dec_acks.eq(0)
1301
1302         sync += r1.ls_valid.eq(0)
1303         # complete tlbies and TLB loads in the third cycle
1304         sync += r1.mmu_done.eq(r0_valid & (r0.tlbie | r0.tlbld))
1305
1306         with m.If((req_op == Op.OP_LOAD_HIT) | (req_op == Op.OP_STCX_FAIL)):
1307             with m.If(~r0.mmu_req):
1308                 sync += r1.ls_valid.eq(1)
1309             with m.Else():
1310                 sync += r1.mmu_done.eq(1)
1311
1312         with m.If(r1.write_tag):
1313             # Store new tag in selected way
1314             replace_way_onehot = Signal(NUM_WAYS)
1315             comb += replace_way_onehot.eq(1<<replace_way)
1316             for i in range(NUM_WAYS):
1317                 with m.If(replace_way_onehot[i]):
1318                     ct = Signal(TAG_RAM_WIDTH)
1319                     comb += ct.eq(cache_tags[r1.store_index].tag)
1320                     comb += ct.word_select(i, TAG_WIDTH).eq(r1.reload_tag)
1321                     sync += cache_tags[r1.store_index].tag.eq(ct)
1322             sync += r1.store_way.eq(replace_way)
1323             sync += r1.write_tag.eq(0)
1324
1325         # Take request from r1.req if there is one there,
1326         # else from req_op, ra, etc.
1327         with m.If(r1.full):
1328             comb += req.eq(r1.req)
1329         with m.Else():
1330             comb += req.op.eq(req_op)
1331             comb += req.valid.eq(req_go)
1332             comb += req.mmu_req.eq(r0.mmu_req)
1333             comb += req.dcbz.eq(r0.req.dcbz)
1334             comb += req.real_addr.eq(ra)
1335
1336             with m.If(r0.req.dcbz):
1337                 # force data to 0 for dcbz
1338                 comb += req.data.eq(0)
1339             with m.Elif(r0.d_valid):
1340                 comb += req.data.eq(r0.req.data)
1341             with m.Else():
1342                 comb += req.data.eq(d_in.data)
1343
1344             # Select all bytes for dcbz
1345             # and for cacheable loads
1346             with m.If(r0.req.dcbz | (r0.req.load & ~r0.req.nc)):
1347                 comb += req.byte_sel.eq(~0) # all 1s
1348             with m.Else():
1349                 comb += req.byte_sel.eq(r0.req.byte_sel)
1350             comb += req.hit_way.eq(req_hit_way)
1351             comb += req.same_tag.eq(req_same_tag)
1352
1353             # Store the incoming request from r0,
1354             # if it is a slow request
1355             # Note that r1.full = 1 implies req_op = OP_NONE
1356             with m.If((req_op == Op.OP_LOAD_MISS)
1357                       | (req_op == Op.OP_LOAD_NC)
1358                       | (req_op == Op.OP_STORE_MISS)
1359                       | (req_op == Op.OP_STORE_HIT)):
1360                 sync += r1.req.eq(req)
1361                 sync += r1.full.eq(1)
1362
1363         # Main state machine
1364         with m.Switch(r1.state):
1365
1366             with m.Case(State.IDLE):
1367                 sync += r1.wb.adr.eq(req.real_addr[ROW_LINE_BITS:])
1368                 sync += r1.wb.sel.eq(req.byte_sel)
1369                 sync += r1.wb.dat.eq(req.data)
1370                 sync += r1.dcbz.eq(req.dcbz)
1371
1372                 # Keep track of our index and way
1373                 # for subsequent stores.
1374                 sync += r1.store_index.eq(req_idx)
1375                 sync += r1.store_row.eq(req_row)
1376                 sync += r1.end_row_ix.eq(get_row_of_line(req_row)-1)
1377                 sync += r1.reload_tag.eq(req_tag)
1378                 sync += r1.req.same_tag.eq(1)
1379
1380                 with m.If(req.op == Op.OP_STORE_HIT):
1381                     sync += r1.store_way.eq(req.hit_way)
1382
1383                 # Reset per-row valid bits,
1384                 # ready for handling OP_LOAD_MISS
1385                 for i in range(ROW_PER_LINE):
1386                     sync += r1.rows_valid[i].eq(0)
1387
1388                 with m.If(req_op != Op.OP_NONE):
1389                     sync += Display("cache op %d", req.op)
1390
1391                 with m.Switch(req.op):
1392                     with m.Case(Op.OP_LOAD_HIT):
1393                         # stay in IDLE state
1394                         pass
1395
1396                     with m.Case(Op.OP_LOAD_MISS):
1397                         sync += Display("cache miss real addr: %x " \
1398                                 "idx: %x tag: %x",
1399                                 req.real_addr, req_row, req_tag)
1400
1401                         # Start the wishbone cycle
1402                         sync += r1.wb.we.eq(0)
1403                         sync += r1.wb.cyc.eq(1)
1404                         sync += r1.wb.stb.eq(1)
1405
1406                         # Track that we had one request sent
1407                         sync += r1.state.eq(State.RELOAD_WAIT_ACK)
1408                         sync += r1.write_tag.eq(1)
1409
1410                     with m.Case(Op.OP_LOAD_NC):
1411                         sync += r1.wb.cyc.eq(1)
1412                         sync += r1.wb.stb.eq(1)
1413                         sync += r1.wb.we.eq(0)
1414                         sync += r1.state.eq(State.NC_LOAD_WAIT_ACK)
1415
1416                     with m.Case(Op.OP_STORE_HIT, Op.OP_STORE_MISS):
1417                         with m.If(~req.dcbz):
1418                             sync += r1.state.eq(State.STORE_WAIT_ACK)
1419                             sync += r1.acks_pending.eq(1)
1420                             sync += r1.full.eq(0)
1421                             sync += r1.slow_valid.eq(1)
1422
1423                             with m.If(~req.mmu_req):
1424                                 sync += r1.ls_valid.eq(1)
1425                             with m.Else():
1426                                 sync += r1.mmu_done.eq(1)
1427
1428                             with m.If(req.op == Op.OP_STORE_HIT):
1429                                 sync += r1.write_bram.eq(1)
1430                         with m.Else():
1431                             # dcbz is handled much like a load miss except
1432                             # that we are writing to memory instead of reading
1433                             sync += r1.state.eq(State.RELOAD_WAIT_ACK)
1434
1435                             with m.If(req.op == Op.OP_STORE_MISS):
1436                                 sync += r1.write_tag.eq(1)
1437
1438                         sync += r1.wb.we.eq(1)
1439                         sync += r1.wb.cyc.eq(1)
1440                         sync += r1.wb.stb.eq(1)
1441
1442                     # OP_NONE and OP_BAD do nothing
1443                     # OP_BAD & OP_STCX_FAIL were
1444                     # handled above already
1445                     with m.Case(Op.OP_NONE):
1446                         pass
1447                     with m.Case(Op.OP_BAD):
1448                         pass
1449                     with m.Case(Op.OP_STCX_FAIL):
1450                         pass
1451
1452             with m.Case(State.RELOAD_WAIT_ACK):
1453                 ld_stbs_done = Signal()
1454                 # Requests are all sent if stb is 0
1455                 comb += ld_stbs_done.eq(~r1.wb.stb)
1456
1457                 # If we are still sending requests, was one accepted?
1458                 with m.If((~bus.stall) & r1.wb.stb):
1459                     # That was the last word?  We are done sending.
1460                     # Clear stb and set ld_stbs_done so we can handle an
1461                     # eventual last ack on the same cycle.
1462                     # sigh - reconstruct wb adr with 3 extra 0s at front
1463                     wb_adr = Cat(Const(0, ROW_OFF_BITS), r1.wb.adr)
1464                     with m.If(is_last_row_addr(wb_adr, r1.end_row_ix)):
1465                         sync += r1.wb.stb.eq(0)
1466                         comb += ld_stbs_done.eq(1)
1467
1468                     # Calculate the next row address in the current cache line
1469                     row = Signal(LINE_OFF_BITS-ROW_OFF_BITS)
1470                     comb += row.eq(r1.wb.adr)
1471                     sync += r1.wb.adr[:LINE_OFF_BITS-ROW_OFF_BITS].eq(row+1)
1472
1473                 # Incoming acks processing
1474                 sync += r1.forward_valid1.eq(bus.ack)
1475                 with m.If(bus.ack):
1476                     srow = Signal(ROW_LINE_BITS)
1477                     comb += srow.eq(r1.store_row)
1478                     sync += r1.rows_valid[srow].eq(1)
1479
1480                     # If this is the data we were looking for,
1481                     # we can complete the request next cycle.
1482                     # Compare the whole address in case the
1483                     # request in r1.req is not the one that
1484                     # started this refill.
1485                     with m.If(req.valid & r1.req.same_tag &
1486                               ((r1.dcbz & r1.req.dcbz) |
1487                                (~r1.dcbz & (r1.req.op == Op.OP_LOAD_MISS))) &
1488                                 (r1.store_row == get_row(req.real_addr))):
1489                         sync += r1.full.eq(0)
1490                         sync += r1.slow_valid.eq(1)
1491                         with m.If(~r1.mmu_req):
1492                             sync += r1.ls_valid.eq(1)
1493                         with m.Else():
1494                             sync += r1.mmu_done.eq(1)
1495                         sync += r1.forward_sel.eq(~0) # all 1s
1496                         sync += r1.use_forward1.eq(1)
1497
1498                     # Check for completion
1499                     with m.If(ld_stbs_done & is_last_row(r1.store_row,
1500                                                       r1.end_row_ix)):
1501                         # Complete wishbone cycle
1502                         sync += r1.wb.cyc.eq(0)
1503
1504                         # Cache line is now valid
1505                         cv = Signal(INDEX_BITS)
1506                         comb += cv.eq(cache_tags[r1.store_index].valid)
1507                         comb += cv.bit_select(r1.store_way, 1).eq(1)
1508                         sync += cache_tags[r1.store_index].valid.eq(cv)
1509
1510                         sync += r1.state.eq(State.IDLE)
1511                         sync += Display("cache valid set %x "
1512                                         "idx %d way %d",
1513                                          cv, r1.store_index, r1.store_way)
1514
1515                     # Increment store row counter
1516                     sync += r1.store_row.eq(next_row(r1.store_row))
1517
1518             with m.Case(State.STORE_WAIT_ACK):
1519                 st_stbs_done = Signal()
1520                 acks        = Signal(3)
1521                 adjust_acks = Signal(3)
1522
1523                 comb += st_stbs_done.eq(~r1.wb.stb)
1524                 comb += acks.eq(r1.acks_pending)
1525
1526                 with m.If(r1.inc_acks != r1.dec_acks):
1527                     with m.If(r1.inc_acks):
1528                         comb += adjust_acks.eq(acks + 1)
1529                     with m.Else():
1530                         comb += adjust_acks.eq(acks - 1)
1531                 with m.Else():
1532                     comb += adjust_acks.eq(acks)
1533
1534                 sync += r1.acks_pending.eq(adjust_acks)
1535
1536                 # Clear stb when slave accepted request
1537                 with m.If(~bus.stall):
1538                     # See if there is another store waiting
1539                     # to be done which is in the same real page.
1540                     with m.If(req.valid):
1541                         _ra = req.real_addr[ROW_LINE_BITS:SET_SIZE_BITS]
1542                         sync += r1.wb.adr[0:SET_SIZE_BITS].eq(_ra)
1543                         sync += r1.wb.dat.eq(req.data)
1544                         sync += r1.wb.sel.eq(req.byte_sel)
1545
1546                     with m.If((adjust_acks < 7) & req.same_tag &
1547                                 ((req.op == Op.OP_STORE_MISS)
1548                                  | (req.op == Op.OP_STORE_HIT))):
1549                         sync += r1.wb.stb.eq(1)
1550                         comb += st_stbs_done.eq(0)
1551
1552                         with m.If(req.op == Op.OP_STORE_HIT):
1553                             sync += r1.write_bram.eq(1)
1554                         sync += r1.full.eq(0)
1555                         sync += r1.slow_valid.eq(1)
1556
1557                         # Store requests never come from the MMU
1558                         sync += r1.ls_valid.eq(1)
1559                         comb += st_stbs_done.eq(0)
1560                         sync += r1.inc_acks.eq(1)
1561                     with m.Else():
1562                         sync += r1.wb.stb.eq(0)
1563                         comb += st_stbs_done.eq(1)
1564
1565                 # Got ack ? See if complete.
1566                 with m.If(bus.ack):
1567                     with m.If(st_stbs_done & (adjust_acks == 1)):
1568                         sync += r1.state.eq(State.IDLE)
1569                         sync += r1.wb.cyc.eq(0)
1570                         sync += r1.wb.stb.eq(0)
1571                     sync += r1.dec_acks.eq(1)
1572
1573             with m.Case(State.NC_LOAD_WAIT_ACK):
1574                 # Clear stb when slave accepted request
1575                 with m.If(~bus.stall):
1576                     sync += r1.wb.stb.eq(0)
1577
1578                 # Got ack ? complete.
1579                 with m.If(bus.ack):
1580                     sync += r1.state.eq(State.IDLE)
1581                     sync += r1.full.eq(0)
1582                     sync += r1.slow_valid.eq(1)
1583
1584                     with m.If(~r1.mmu_req):
1585                         sync += r1.ls_valid.eq(1)
1586                     with m.Else():
1587                         sync += r1.mmu_done.eq(1)
1588
1589                     sync += r1.forward_sel.eq(~0) # all 1s
1590                     sync += r1.use_forward1.eq(1)
1591                     sync += r1.wb.cyc.eq(0)
1592                     sync += r1.wb.stb.eq(0)
1593
1594     def dcache_log(self, m, r1, valid_ra, tlb_hit, stall_out):
1595
1596         sync = m.d.sync
1597         d_out, bus, log_out = self.d_out, self.bus, self.log_out
1598
1599         sync += log_out.eq(Cat(r1.state[:3], valid_ra, tlb_hit.way[:3],
1600                                stall_out, req_op[:3], d_out.valid, d_out.error,
1601                                r1.wb.cyc, r1.wb.stb, bus.ack, bus.stall,
1602                                r1.real_adr[3:6]))
1603
1604     def elaborate(self, platform):
1605
1606         m = Module()
1607         comb = m.d.comb
1608         d_in = self.d_in
1609
1610         # Storage. Hopefully "cache_rows" is a BRAM, the rest is LUTs
1611         cache_tags       = CacheTagArray()
1612         cache_tag_set    = Signal(TAG_RAM_WIDTH)
1613
1614         # TODO attribute ram_style : string;
1615         # TODO attribute ram_style of cache_tags : signal is "distributed";
1616
1617         """note: these are passed to nmigen.hdl.Memory as "attributes".
1618            don't know how, just that they are.
1619         """
1620         dtlb            = TLBArray()
1621         # TODO attribute ram_style of
1622         #  dtlb_tags : signal is "distributed";
1623         # TODO attribute ram_style of
1624         #  dtlb_ptes : signal is "distributed";
1625
1626         r0      = RegStage0("r0")
1627         r0_full = Signal()
1628
1629         r1 = RegStage1("r1")
1630
1631         reservation = Reservation()
1632
1633         # Async signals on incoming request
1634         req_index    = Signal(INDEX_BITS)
1635         req_row      = Signal(ROW_BITS)
1636         req_hit_way  = Signal(WAY_BITS)
1637         req_tag      = Signal(TAG_BITS)
1638         req_op       = Signal(Op)
1639         req_data     = Signal(64)
1640         req_same_tag = Signal()
1641         req_go       = Signal()
1642
1643         early_req_row     = Signal(ROW_BITS)
1644
1645         cancel_store      = Signal()
1646         set_rsrv          = Signal()
1647         clear_rsrv        = Signal()
1648
1649         r0_valid          = Signal()
1650         r0_stall          = Signal()
1651
1652         use_forward1_next = Signal()
1653         use_forward2_next = Signal()
1654
1655         cache_out_row     = Signal(WB_DATA_BITS)
1656
1657         plru_victim       = PLRUOut()
1658         replace_way       = Signal(WAY_BITS)
1659
1660         # Wishbone read/write/cache write formatting signals
1661         bus_sel           = Signal(8)
1662
1663         # TLB signals
1664         tlb_way       = TLBRecord("tlb_way")
1665         tlb_req_index = Signal(TLB_SET_BITS)
1666         tlb_hit       = TLBHit("tlb_hit")
1667         pte           = Signal(TLB_PTE_BITS)
1668         ra            = Signal(REAL_ADDR_BITS)
1669         valid_ra      = Signal()
1670         perm_attr     = PermAttr("dc_perms")
1671         rc_ok         = Signal()
1672         perm_ok       = Signal()
1673         access_ok     = Signal()
1674
1675         tlb_plru_victim = TLBPLRUOut()
1676
1677         # we don't yet handle collisions between loadstore1 requests
1678         # and MMU requests
1679         comb += self.m_out.stall.eq(0)
1680
1681         # Hold off the request in r0 when r1 has an uncompleted request
1682         comb += r0_stall.eq(r0_full & (r1.full | d_in.hold))
1683         comb += r0_valid.eq(r0_full & ~r1.full & ~d_in.hold)
1684         comb += self.stall_out.eq(r0_stall)
1685
1686         # deal with litex not doing wishbone pipeline mode
1687         # XXX in wrong way.  FIFOs are needed in the SRAM test
1688         # so that stb/ack match up. same thing done in icache.py
1689         comb += self.bus.stall.eq(self.bus.cyc & ~self.bus.ack)
1690
1691         # Wire up wishbone request latch out of stage 1
1692         comb += self.bus.we.eq(r1.wb.we)
1693         comb += self.bus.adr.eq(r1.wb.adr)
1694         comb += self.bus.sel.eq(r1.wb.sel)
1695         comb += self.bus.stb.eq(r1.wb.stb)
1696         comb += self.bus.dat_w.eq(r1.wb.dat)
1697         comb += self.bus.cyc.eq(r1.wb.cyc)
1698
1699         # call sub-functions putting everything together, using shared
1700         # signals established above
1701         self.stage_0(m, r0, r1, r0_full)
1702         self.tlb_read(m, r0_stall, tlb_way, dtlb)
1703         self.tlb_search(m, tlb_req_index, r0, r0_valid,
1704                         tlb_way,
1705                         pte, tlb_hit, valid_ra, perm_attr, ra)
1706         self.tlb_update(m, r0_valid, r0, dtlb, tlb_req_index,
1707                         tlb_hit, tlb_plru_victim,
1708                         tlb_way)
1709         self.maybe_plrus(m, r1, plru_victim)
1710         self.maybe_tlb_plrus(m, r1, tlb_plru_victim)
1711         self.cache_tag_read(m, r0_stall, req_index, cache_tag_set, cache_tags)
1712         self.dcache_request(m, r0, ra, req_index, req_row, req_tag,
1713                            r0_valid, r1, cache_tags, replace_way,
1714                            use_forward1_next, use_forward2_next,
1715                            req_hit_way, plru_victim, rc_ok, perm_attr,
1716                            valid_ra, perm_ok, access_ok, req_op, req_go,
1717                            tlb_hit, tlb_way, cache_tag_set,
1718                            cancel_store, req_same_tag, r0_stall, early_req_row)
1719         self.reservation_comb(m, cancel_store, set_rsrv, clear_rsrv,
1720                            r0_valid, r0, reservation)
1721         self.reservation_reg(m, r0_valid, access_ok, set_rsrv, clear_rsrv,
1722                            reservation, r0)
1723         self.writeback_control(m, r1, cache_out_row)
1724         self.rams(m, r1, early_req_row, cache_out_row, replace_way)
1725         self.dcache_fast_hit(m, req_op, r0_valid, r0, r1,
1726                         req_hit_way, req_index, req_tag, access_ok,
1727                         tlb_hit, tlb_req_index)
1728         self.dcache_slow(m, r1, use_forward1_next, use_forward2_next,
1729                     r0, replace_way,
1730                     req_hit_way, req_same_tag,
1731                          r0_valid, req_op, cache_tags, req_go, ra)
1732         #self.dcache_log(m, r1, valid_ra, tlb_hit, stall_out)
1733
1734         return m
1735
1736
1737 if __name__ == '__main__':
1738     dut = DCache()
1739     vl = rtlil.convert(dut, ports=[])
1740     with open("test_dcache.il", "w") as f:
1741         f.write(vl)