src/soc/experiment/dcache.py

   1 """DCache
   2
   3 based on Anton Blanchard microwatt dcache.vhdl
   4
   5 note that the microwatt dcache wishbone interface expects "stall".
   6 for simplicity at the moment this is hard-coded to cyc & ~ack.
   7 see WB4 spec, p84, section 5.2.1
   8
   9 IMPORTANT: for store, the data is sampled the cycle AFTER the "valid"
  10 is raised.  sigh
  11
  12 Links:
  13
  14 * https://libre-soc.org/3d_gpu/architecture/set_associative_cache.jpg
  15 * https://bugs.libre-soc.org/show_bug.cgi?id=469
  16
  17 """
  18
  19 import sys
  20
  21 from nmutil.gtkw import write_gtkw
  22
  23 sys.setrecursionlimit(1000000)
  24
  25 from enum import Enum, unique
  26
  27 from nmigen import Module, Signal, Elaboratable, Cat, Repl, Array, Const
  28 from nmutil.util import Display
  29
  30 from copy import deepcopy
  31 from random import randint, seed
  32
  33 from nmigen_soc.wishbone.bus import Interface
  34
  35 from nmigen.cli import main
  36 from nmutil.iocontrol import RecordObject
  37 from nmigen.utils import log2_int
  38 from soc.experiment.mem_types import (LoadStore1ToDCacheType,
  39                                      DCacheToLoadStore1Type,
  40                                      MMUToDCacheType,
  41                                      DCacheToMMUType)
  42
  43 from soc.experiment.wb_types import (WB_ADDR_BITS, WB_DATA_BITS, WB_SEL_BITS,
  44                                 WBAddrType, WBDataType, WBSelType,
  45                                 WBMasterOut, WBSlaveOut,
  46                                 WBMasterOutVector, WBSlaveOutVector,
  47                                 WBIOMasterOut, WBIOSlaveOut)
  48
  49 from soc.experiment.cache_ram import CacheRam
  50 #from soc.experiment.plru import PLRU
  51 from nmutil.plru import PLRU
  52
  53 # for test
  54 from soc.bus.sram import SRAM
  55 from nmigen import Memory
  56 from nmigen.cli import rtlil
  57
  58 # NOTE: to use cxxsim, export NMIGEN_SIM_MODE=cxxsim from the shell
  59 # Also, check out the cxxsim nmigen branch, and latest yosys from git
  60 from nmutil.sim_tmp_alternative import Simulator
  61
  62 from nmutil.util import wrap
  63
  64
  65 # TODO: make these parameters of DCache at some point
  66 LINE_SIZE = 64    # Line size in bytes
  67 NUM_LINES = 16    # Number of lines in a set
  68 NUM_WAYS = 4      # Number of ways
  69 TLB_SET_SIZE = 64 # L1 DTLB entries per set
  70 TLB_NUM_WAYS = 2  # L1 DTLB number of sets
  71 TLB_LG_PGSZ = 12  # L1 DTLB log_2(page_size)
  72 LOG_LENGTH = 0    # Non-zero to enable log data collection
  73
  74 # BRAM organisation: We never access more than
  75 #     -- WB_DATA_BITS at a time so to save
  76 #     -- resources we make the array only that wide, and
  77 #     -- use consecutive indices to make a cache "line"
  78 #     --
  79 #     -- ROW_SIZE is the width in bytes of the BRAM
  80 #     -- (based on WB, so 64-bits)
  81 ROW_SIZE = WB_DATA_BITS // 8;
  82
  83 # ROW_PER_LINE is the number of row (wishbone
  84 # transactions) in a line
  85 ROW_PER_LINE = LINE_SIZE // ROW_SIZE
  86
  87 # BRAM_ROWS is the number of rows in BRAM needed
  88 # to represent the full dcache
  89 BRAM_ROWS = NUM_LINES * ROW_PER_LINE
  90
  91 print ("ROW_SIZE", ROW_SIZE)
  92 print ("ROW_PER_LINE", ROW_PER_LINE)
  93 print ("BRAM_ROWS", BRAM_ROWS)
  94 print ("NUM_WAYS", NUM_WAYS)
  95
  96 # Bit fields counts in the address
  97
  98 # REAL_ADDR_BITS is the number of real address
  99 # bits that we store
 100 REAL_ADDR_BITS = 56
 101
 102 # ROW_BITS is the number of bits to select a row
 103 ROW_BITS = log2_int(BRAM_ROWS)
 104
 105 # ROW_LINE_BITS is the number of bits to select
 106 # a row within a line
 107 ROW_LINE_BITS = log2_int(ROW_PER_LINE)
 108
 109 # LINE_OFF_BITS is the number of bits for
 110 # the offset in a cache line
 111 LINE_OFF_BITS = log2_int(LINE_SIZE)
 112
 113 # ROW_OFF_BITS is the number of bits for
 114 # the offset in a row
 115 ROW_OFF_BITS = log2_int(ROW_SIZE)
 116
 117 # INDEX_BITS is the number if bits to
 118 # select a cache line
 119 INDEX_BITS = log2_int(NUM_LINES)
 120
 121 # SET_SIZE_BITS is the log base 2 of the set size
 122 SET_SIZE_BITS = LINE_OFF_BITS + INDEX_BITS
 123
 124 # TAG_BITS is the number of bits of
 125 # the tag part of the address
 126 TAG_BITS = REAL_ADDR_BITS - SET_SIZE_BITS
 127
 128 # TAG_WIDTH is the width in bits of each way of the tag RAM
 129 TAG_WIDTH = TAG_BITS + 7 - ((TAG_BITS + 7) % 8)
 130
 131 # WAY_BITS is the number of bits to select a way
 132 WAY_BITS = log2_int(NUM_WAYS)
 133
 134 # Example of layout for 32 lines of 64 bytes:
 135 layout = """\
 136   ..  tag    |index|  line  |
 137   ..         |   row   |    |
 138   ..         |     |---|    | ROW_LINE_BITS  (3)
 139   ..         |     |--- - --| LINE_OFF_BITS (6)
 140   ..         |         |- --| ROW_OFF_BITS  (3)
 141   ..         |----- ---|    | ROW_BITS      (8)
 142   ..         |-----|        | INDEX_BITS    (5)
 143   .. --------|              | TAG_BITS      (45)
 144 """
 145 print (layout)
 146 print ("Dcache TAG %d IDX %d ROW_BITS %d ROFF %d LOFF %d RLB %d" % \
 147             (TAG_BITS, INDEX_BITS, ROW_BITS,
 148              ROW_OFF_BITS, LINE_OFF_BITS, ROW_LINE_BITS))
 149 print ("index @: %d-%d" % (LINE_OFF_BITS, SET_SIZE_BITS))
 150 print ("row @: %d-%d" % (LINE_OFF_BITS, ROW_OFF_BITS))
 151 print ("tag @: %d-%d width %d" % (SET_SIZE_BITS, REAL_ADDR_BITS, TAG_WIDTH))
 152
 153 TAG_RAM_WIDTH = TAG_WIDTH * NUM_WAYS
 154
 155 print ("TAG_RAM_WIDTH", TAG_RAM_WIDTH)
 156
 157 def CacheTagArray():
 158     return Array(Signal(TAG_RAM_WIDTH, name="cachetag_%d" % x) \
 159                         for x in range(NUM_LINES))
 160
 161 def CacheValidBitsArray():
 162     return Array(Signal(NUM_WAYS, name="cachevalid_%d" % x) \
 163                         for x in range(NUM_LINES))
 164
 165 def RowPerLineValidArray():
 166     return Array(Signal(name="rows_valid%d" % x) \
 167                         for x in range(ROW_PER_LINE))
 168
 169 # L1 TLB
 170 TLB_SET_BITS     = log2_int(TLB_SET_SIZE)
 171 TLB_WAY_BITS     = log2_int(TLB_NUM_WAYS)
 172 TLB_EA_TAG_BITS  = 64 - (TLB_LG_PGSZ + TLB_SET_BITS)
 173 TLB_TAG_WAY_BITS = TLB_NUM_WAYS * TLB_EA_TAG_BITS
 174 TLB_PTE_BITS     = 64
 175 TLB_PTE_WAY_BITS = TLB_NUM_WAYS * TLB_PTE_BITS;
 176
 177 def ispow2(x):
 178     return (1<<log2_int(x, False)) == x
 179
 180 assert (LINE_SIZE % ROW_SIZE) == 0, "LINE_SIZE not multiple of ROW_SIZE"
 181 assert ispow2(LINE_SIZE), "LINE_SIZE not power of 2"
 182 assert ispow2(NUM_LINES), "NUM_LINES not power of 2"
 183 assert ispow2(ROW_PER_LINE), "ROW_PER_LINE not power of 2"
 184 assert ROW_BITS == (INDEX_BITS + ROW_LINE_BITS), "geometry bits don't add up"
 185 assert (LINE_OFF_BITS == ROW_OFF_BITS + ROW_LINE_BITS), \
 186         "geometry bits don't add up"
 187 assert REAL_ADDR_BITS == (TAG_BITS + INDEX_BITS + LINE_OFF_BITS), \
 188         "geometry bits don't add up"
 189 assert REAL_ADDR_BITS == (TAG_BITS + ROW_BITS + ROW_OFF_BITS), \
 190          "geometry bits don't add up"
 191 assert 64 == WB_DATA_BITS, "Can't yet handle wb width that isn't 64-bits"
 192 assert SET_SIZE_BITS <= TLB_LG_PGSZ, "Set indexed by virtual address"
 193
 194
 195 def TLBValidBitsArray():
 196     return Array(Signal(TLB_NUM_WAYS, name="tlbvalid%d" % x) \
 197                 for x in range(TLB_SET_SIZE))
 198
 199 def TLBTagEAArray():
 200     return Array(Signal(TLB_EA_TAG_BITS, name="tlbtagea%d" % x) \
 201                 for x in range (TLB_NUM_WAYS))
 202
 203 def TLBTagsArray():
 204     return Array(Signal(TLB_TAG_WAY_BITS, name="tlbtags%d" % x) \
 205                 for x in range (TLB_SET_SIZE))
 206
 207 def TLBPtesArray():
 208     return Array(Signal(TLB_PTE_WAY_BITS, name="tlbptes%d" % x) \
 209                 for x in range(TLB_SET_SIZE))
 210
 211 def HitWaySet():
 212     return Array(Signal(WAY_BITS, name="hitway_%d" % x) \
 213                         for x in range(TLB_NUM_WAYS))
 214
 215 # Cache RAM interface
 216 def CacheRamOut():
 217     return Array(Signal(WB_DATA_BITS, name="cache_out%d" % x) \
 218                  for x in range(NUM_WAYS))
 219
 220 # PLRU output interface
 221 def PLRUOut():
 222     return Array(Signal(WAY_BITS, name="plru_out%d" % x) \
 223                 for x in range(NUM_LINES))
 224
 225 # TLB PLRU output interface
 226 def TLBPLRUOut():
 227     return Array(Signal(TLB_WAY_BITS, name="tlbplru_out%d" % x) \
 228                 for x in range(TLB_SET_SIZE))
 229
 230 # Helper functions to decode incoming requests
 231 #
 232 # Return the cache line index (tag index) for an address
 233 def get_index(addr):
 234     return addr[LINE_OFF_BITS:SET_SIZE_BITS]
 235
 236 # Return the cache row index (data memory) for an address
 237 def get_row(addr):
 238     return addr[ROW_OFF_BITS:SET_SIZE_BITS]
 239
 240 # Return the index of a row within a line
 241 def get_row_of_line(row):
 242     return row[:ROW_BITS][:ROW_LINE_BITS]
 243
 244 # Returns whether this is the last row of a line
 245 def is_last_row_addr(addr, last):
 246     return addr[ROW_OFF_BITS:LINE_OFF_BITS] == last
 247
 248 # Returns whether this is the last row of a line
 249 def is_last_row(row, last):
 250     return get_row_of_line(row) == last
 251
 252 # Return the next row in the current cache line. We use a
 253 # dedicated function in order to limit the size of the
 254 # generated adder to be only the bits within a cache line
 255 # (3 bits with default settings)
 256 def next_row(row):
 257     row_v = row[0:ROW_LINE_BITS] + 1
 258     return Cat(row_v[:ROW_LINE_BITS], row[ROW_LINE_BITS:])
 259
 260 # Get the tag value from the address
 261 def get_tag(addr):
 262     return addr[SET_SIZE_BITS:REAL_ADDR_BITS]
 263
 264 # Read a tag from a tag memory row
 265 def read_tag(way, tagset):
 266     return tagset.word_select(way, TAG_WIDTH)[:TAG_BITS]
 267
 268 # Read a TLB tag from a TLB tag memory row
 269 def read_tlb_tag(way, tags):
 270     return tags.word_select(way, TLB_EA_TAG_BITS)
 271
 272 # Write a TLB tag to a TLB tag memory row
 273 def write_tlb_tag(way, tags, tag):
 274     return read_tlb_tag(way, tags).eq(tag)
 275
 276 # Read a PTE from a TLB PTE memory row
 277 def read_tlb_pte(way, ptes):
 278     return ptes.word_select(way, TLB_PTE_BITS)
 279
 280 def write_tlb_pte(way, ptes, newpte):
 281     return read_tlb_pte(way, ptes).eq(newpte)
 282
 283
 284 # Record for storing permission, attribute, etc. bits from a PTE
 285 class PermAttr(RecordObject):
 286     def __init__(self, name=None):
 287         super().__init__(name=name)
 288         self.reference = Signal()
 289         self.changed   = Signal()
 290         self.nocache   = Signal()
 291         self.priv      = Signal()
 292         self.rd_perm   = Signal()
 293         self.wr_perm   = Signal()
 294
 295
 296 def extract_perm_attr(pte):
 297     pa = PermAttr()
 298     return pa;
 299
 300
 301 # Type of operation on a "valid" input
 302 @unique
 303 class Op(Enum):
 304     OP_NONE       = 0
 305     OP_BAD        = 1 # NC cache hit, TLB miss, prot/RC failure
 306     OP_STCX_FAIL  = 2 # conditional store w/o reservation
 307     OP_LOAD_HIT   = 3 # Cache hit on load
 308     OP_LOAD_MISS  = 4 # Load missing cache
 309     OP_LOAD_NC    = 5 # Non-cachable load
 310     OP_STORE_HIT  = 6 # Store hitting cache
 311     OP_STORE_MISS = 7 # Store missing cache
 312
 313
 314 # Cache state machine
 315 @unique
 316 class State(Enum):
 317     IDLE             = 0 # Normal load hit processing
 318     RELOAD_WAIT_ACK  = 1 # Cache reload wait ack
 319     STORE_WAIT_ACK   = 2 # Store wait ack
 320     NC_LOAD_WAIT_ACK = 3 # Non-cachable load wait ack
 321
 322
 323 # Dcache operations:
 324 #
 325 # In order to make timing, we use the BRAMs with
 326 # an output buffer, which means that the BRAM
 327 # output is delayed by an extra cycle.
 328 #
 329 # Thus, the dcache has a 2-stage internal pipeline
 330 # for cache hits with no stalls.
 331 #
 332 # All other operations are handled via stalling
 333 # in the first stage.
 334 #
 335 # The second stage can thus complete a hit at the same
 336 # time as the first stage emits a stall for a complex op.
 337 #
 338 # Stage 0 register, basically contains just the latched request
 339
 340 class RegStage0(RecordObject):
 341     def __init__(self, name=None):
 342         super().__init__(name=name)
 343         self.req     = LoadStore1ToDCacheType(name="lsmem")
 344         self.tlbie   = Signal() # indicates a tlbie request (from MMU)
 345         self.doall   = Signal() # with tlbie, indicates flush whole TLB
 346         self.tlbld   = Signal() # indicates a TLB load request (from MMU)
 347         self.mmu_req = Signal() # indicates source of request
 348         self.d_valid = Signal() # indicates req.data is valid now
 349
 350
 351 class MemAccessRequest(RecordObject):
 352     def __init__(self, name=None):
 353         super().__init__(name=name)
 354         self.op        = Signal(Op)
 355         self.valid     = Signal()
 356         self.dcbz      = Signal()
 357         self.real_addr = Signal(REAL_ADDR_BITS)
 358         self.data      = Signal(64)
 359         self.byte_sel  = Signal(8)
 360         self.hit_way   = Signal(WAY_BITS)
 361         self.same_tag  = Signal()
 362         self.mmu_req   = Signal()
 363
 364
 365 # First stage register, contains state for stage 1 of load hits
 366 # and for the state machine used by all other operations
 367 class RegStage1(RecordObject):
 368     def __init__(self, name=None):
 369         super().__init__(name=name)
 370         # Info about the request
 371         self.full             = Signal() # have uncompleted request
 372         self.mmu_req          = Signal() # request is from MMU
 373         self.req              = MemAccessRequest(name="reqmem")
 374
 375         # Cache hit state
 376         self.hit_way          = Signal(WAY_BITS)
 377         self.hit_load_valid   = Signal()
 378         self.hit_index        = Signal(INDEX_BITS)
 379         self.cache_hit        = Signal()
 380
 381         # TLB hit state
 382         self.tlb_hit          = Signal()
 383         self.tlb_hit_way      = Signal(TLB_NUM_WAYS)
 384         self.tlb_hit_index    = Signal(TLB_WAY_BITS)
 385
 386         # 2-stage data buffer for data forwarded from writes to reads
 387         self.forward_data1    = Signal(64)
 388         self.forward_data2    = Signal(64)
 389         self.forward_sel1     = Signal(8)
 390         self.forward_valid1   = Signal()
 391         self.forward_way1     = Signal(WAY_BITS)
 392         self.forward_row1     = Signal(ROW_BITS)
 393         self.use_forward1     = Signal()
 394         self.forward_sel      = Signal(8)
 395
 396         # Cache miss state (reload state machine)
 397         self.state            = Signal(State)
 398         self.dcbz             = Signal()
 399         self.write_bram       = Signal()
 400         self.write_tag        = Signal()
 401         self.slow_valid       = Signal()
 402         self.wb               = WBMasterOut("wb")
 403         self.reload_tag       = Signal(TAG_BITS)
 404         self.store_way        = Signal(WAY_BITS)
 405         self.store_row        = Signal(ROW_BITS)
 406         self.store_index      = Signal(INDEX_BITS)
 407         self.end_row_ix       = Signal(ROW_LINE_BITS)
 408         self.rows_valid       = RowPerLineValidArray()
 409         self.acks_pending     = Signal(3)
 410         self.inc_acks         = Signal()
 411         self.dec_acks         = Signal()
 412
 413         # Signals to complete (possibly with error)
 414         self.ls_valid         = Signal()
 415         self.ls_error         = Signal()
 416         self.mmu_done         = Signal()
 417         self.mmu_error        = Signal()
 418         self.cache_paradox    = Signal()
 419
 420         # Signal to complete a failed stcx.
 421         self.stcx_fail        = Signal()
 422
 423
 424 # Reservation information
 425 class Reservation(RecordObject):
 426     def __init__(self):
 427         super().__init__()
 428         self.valid = Signal()
 429         self.addr  = Signal(64-LINE_OFF_BITS)
 430
 431
 432 class DTLBUpdate(Elaboratable):
 433     def __init__(self):
 434         self.tlbie    = Signal()
 435         self.tlbwe    = Signal()
 436         self.doall    = Signal()
 437         self.updated  = Signal()
 438         self.v_updated  = Signal()
 439         self.tlb_hit    = Signal()
 440         self.tlb_req_index = Signal(TLB_SET_BITS)
 441
 442         self.tlb_hit_way     = Signal(TLB_WAY_BITS)
 443         self.tlb_tag_way     = Signal(TLB_TAG_WAY_BITS)
 444         self.tlb_pte_way     = Signal(TLB_PTE_WAY_BITS)
 445         self.repl_way        = Signal(TLB_WAY_BITS)
 446         self.eatag           = Signal(TLB_EA_TAG_BITS)
 447         self.pte_data        = Signal(TLB_PTE_BITS)
 448
 449         self.dv = Signal(TLB_NUM_WAYS) # tlb_way_valids_t
 450
 451         self.tb_out = Signal(TLB_TAG_WAY_BITS) # tlb_way_tags_t
 452         self.db_out = Signal(TLB_NUM_WAYS)     # tlb_way_valids_t
 453         self.pb_out = Signal(TLB_PTE_WAY_BITS) # tlb_way_ptes_t
 454
 455     def elaborate(self, platform):
 456         m = Module()
 457         comb = m.d.comb
 458         sync = m.d.sync
 459
 460         tagset   = Signal(TLB_TAG_WAY_BITS)
 461         pteset   = Signal(TLB_PTE_WAY_BITS)
 462
 463         tb_out, pb_out, db_out = self.tb_out, self.pb_out, self.db_out
 464         comb += db_out.eq(self.dv)
 465
 466         with m.If(self.tlbie & self.doall):
 467             pass # clear all back in parent
 468         with m.Elif(self.tlbie):
 469             with m.If(self.tlb_hit):
 470                 comb += db_out.bit_select(self.tlb_hit_way, 1).eq(0)
 471                 comb += self.v_updated.eq(1)
 472
 473         with m.Elif(self.tlbwe):
 474
 475             comb += tagset.eq(self.tlb_tag_way)
 476             comb += write_tlb_tag(self.repl_way, tagset, self.eatag)
 477             comb += tb_out.eq(tagset)
 478
 479             comb += pteset.eq(self.tlb_pte_way)
 480             comb += write_tlb_pte(self.repl_way, pteset, self.pte_data)
 481             comb += pb_out.eq(pteset)
 482
 483             comb += db_out.bit_select(self.repl_way, 1).eq(1)
 484
 485             comb += self.updated.eq(1)
 486             comb += self.v_updated.eq(1)
 487
 488         return m
 489
 490
 491 class DCachePendingHit(Elaboratable):
 492
 493     def __init__(self, tlb_pte_way, tlb_valid_way, tlb_hit_way,
 494                       cache_i_validdx, cache_tag_set,
 495                     req_addr,
 496                     hit_set):
 497
 498         self.go          = Signal()
 499         self.virt_mode   = Signal()
 500         self.is_hit      = Signal()
 501         self.tlb_hit     = Signal()
 502         self.hit_way     = Signal(WAY_BITS)
 503         self.rel_match   = Signal()
 504         self.req_index   = Signal(INDEX_BITS)
 505         self.reload_tag  = Signal(TAG_BITS)
 506
 507         self.tlb_hit_way = tlb_hit_way
 508         self.tlb_pte_way = tlb_pte_way
 509         self.tlb_valid_way = tlb_valid_way
 510         self.cache_i_validdx = cache_i_validdx
 511         self.cache_tag_set = cache_tag_set
 512         self.req_addr = req_addr
 513         self.hit_set = hit_set
 514
 515     def elaborate(self, platform):
 516         m = Module()
 517         comb = m.d.comb
 518         sync = m.d.sync
 519
 520         go = self.go
 521         virt_mode = self.virt_mode
 522         is_hit = self.is_hit
 523         tlb_pte_way = self.tlb_pte_way
 524         tlb_valid_way = self.tlb_valid_way
 525         cache_i_validdx = self.cache_i_validdx
 526         cache_tag_set = self.cache_tag_set
 527         req_addr = self.req_addr
 528         tlb_hit_way = self.tlb_hit_way
 529         tlb_hit = self.tlb_hit
 530         hit_set = self.hit_set
 531         hit_way = self.hit_way
 532         rel_match = self.rel_match
 533         req_index = self.req_index
 534         reload_tag = self.reload_tag
 535
 536         rel_matches = Array(Signal(name="rel_matches_%d" % i) \
 537                                     for i in range(TLB_NUM_WAYS))
 538         hit_way_set = HitWaySet()
 539
 540         # Test if pending request is a hit on any way
 541         # In order to make timing in virtual mode,
 542         # when we are using the TLB, we compare each
 543         # way with each of the real addresses from each way of
 544         # the TLB, and then decide later which match to use.
 545
 546         with m.If(virt_mode):
 547             for j in range(TLB_NUM_WAYS): # tlb_num_way_t
 548                 s_tag       = Signal(TAG_BITS, name="s_tag%d" % j)
 549                 s_hit       = Signal()
 550                 s_pte       = Signal(TLB_PTE_BITS)
 551                 s_ra        = Signal(REAL_ADDR_BITS)
 552                 comb += s_pte.eq(read_tlb_pte(j, tlb_pte_way))
 553                 comb += s_ra.eq(Cat(req_addr[0:TLB_LG_PGSZ],
 554                                     s_pte[TLB_LG_PGSZ:REAL_ADDR_BITS]))
 555                 comb += s_tag.eq(get_tag(s_ra))
 556
 557                 for i in range(NUM_WAYS): # way_t
 558                     is_tag_hit = Signal(name="is_tag_hit_%d_%d" % (j, i))
 559                     comb += is_tag_hit.eq(go & cache_i_validdx[i] &
 560                                   (read_tag(i, cache_tag_set) == s_tag)
 561                                   & tlb_valid_way[j])
 562                     with m.If(is_tag_hit):
 563                         comb += hit_way_set[j].eq(i)
 564                         comb += s_hit.eq(1)
 565                 comb += hit_set[j].eq(s_hit)
 566                 with m.If(s_tag == reload_tag):
 567                     comb += rel_matches[j].eq(1)
 568             with m.If(tlb_hit):
 569                 comb += is_hit.eq(hit_set[tlb_hit_way])
 570                 comb += hit_way.eq(hit_way_set[tlb_hit_way])
 571                 comb += rel_match.eq(rel_matches[tlb_hit_way])
 572         with m.Else():
 573             s_tag       = Signal(TAG_BITS)
 574             comb += s_tag.eq(get_tag(req_addr))
 575             for i in range(NUM_WAYS): # way_t
 576                 is_tag_hit = Signal(name="is_tag_hit_%d" % i)
 577                 comb += is_tag_hit.eq(go & cache_i_validdx[i] &
 578                           (read_tag(i, cache_tag_set) == s_tag))
 579                 with m.If(is_tag_hit):
 580                     comb += hit_way.eq(i)
 581                     comb += is_hit.eq(1)
 582             with m.If(s_tag == reload_tag):
 583                 comb += rel_match.eq(1)
 584
 585         return m
 586
 587
 588 class DCache(Elaboratable):
 589     """Set associative dcache write-through
 590
 591     TODO (in no specific order):
 592     * See list in icache.vhdl
 593     * Complete load misses on the cycle when WB data comes instead of
 594       at the end of line (this requires dealing with requests coming in
 595       while not idle...)
 596     """
 597     def __init__(self):
 598         self.d_in      = LoadStore1ToDCacheType("d_in")
 599         self.d_out     = DCacheToLoadStore1Type("d_out")
 600
 601         self.m_in      = MMUToDCacheType("m_in")
 602         self.m_out     = DCacheToMMUType("m_out")
 603
 604         self.stall_out = Signal()
 605
 606         # standard naming (wired to non-standard for compatibility)
 607         self.bus = Interface(addr_width=32,
 608                             data_width=64,
 609                             granularity=8,
 610                             features={'stall'},
 611                             alignment=0,
 612                             name="dcache")
 613
 614         self.log_out   = Signal(20)
 615
 616     def stage_0(self, m, r0, r1, r0_full):
 617         """Latch the request in r0.req as long as we're not stalling
 618         """
 619         comb = m.d.comb
 620         sync = m.d.sync
 621         d_in, d_out, m_in = self.d_in, self.d_out, self.m_in
 622
 623         r = RegStage0("stage0")
 624
 625         # TODO, this goes in unit tests and formal proofs
 626         with m.If(d_in.valid & m_in.valid):
 627             sync += Display("request collision loadstore vs MMU")
 628
 629         with m.If(m_in.valid):
 630             comb += r.req.valid.eq(1)
 631             comb += r.req.load.eq(~(m_in.tlbie | m_in.tlbld))# no invalidate
 632             comb += r.req.dcbz.eq(0)
 633             comb += r.req.nc.eq(0)
 634             comb += r.req.reserve.eq(0)
 635             comb += r.req.virt_mode.eq(0)
 636             comb += r.req.priv_mode.eq(1)
 637             comb += r.req.addr.eq(m_in.addr)
 638             comb += r.req.data.eq(m_in.pte)
 639             comb += r.req.byte_sel.eq(~0) # Const -1 sets all to 0b111....
 640             comb += r.tlbie.eq(m_in.tlbie)
 641             comb += r.doall.eq(m_in.doall)
 642             comb += r.tlbld.eq(m_in.tlbld)
 643             comb += r.mmu_req.eq(1)
 644             m.d.sync += Display("    DCACHE req mmu addr %x pte %x ld %d",
 645                                  m_in.addr, m_in.pte, r.req.load)
 646
 647         with m.Else():
 648             comb += r.req.eq(d_in)
 649             comb += r.req.data.eq(0)
 650             comb += r.tlbie.eq(0)
 651             comb += r.doall.eq(0)
 652             comb += r.tlbld.eq(0)
 653             comb += r.mmu_req.eq(0)
 654         with m.If((~r1.full & ~d_in.hold) | ~r0_full):
 655             sync += r0.eq(r)
 656             sync += r0_full.eq(r.req.valid)
 657             # Sample data the cycle after a request comes in from loadstore1.
 658             # If another request has come in already then the data will get
 659             # put directly into req.data below.
 660             with m.If(r0.req.valid & ~r.req.valid & ~r0.d_valid &
 661                      ~r0.mmu_req):
 662                 sync += r0.req.data.eq(d_in.data)
 663                 sync += r0.d_valid.eq(1)
 664         with m.If(d_in.valid):
 665             m.d.sync += Display("    DCACHE req cache "
 666                                 "virt %d addr %x data %x ld %d",
 667                                  r.req.virt_mode, r.req.addr,
 668                                  r.req.data, r.req.load)
 669
 670     def tlb_read(self, m, r0_stall, tlb_valid_way,
 671                  tlb_tag_way, tlb_pte_way, dtlb_valid_bits,
 672                  dtlb_tags, dtlb_ptes):
 673         """TLB
 674         Operates in the second cycle on the request latched in r0.req.
 675         TLB updates write the entry at the end of the second cycle.
 676         """
 677         comb = m.d.comb
 678         sync = m.d.sync
 679         m_in, d_in = self.m_in, self.d_in
 680
 681         index    = Signal(TLB_SET_BITS)
 682         addrbits = Signal(TLB_SET_BITS)
 683
 684         amin = TLB_LG_PGSZ
 685         amax = TLB_LG_PGSZ + TLB_SET_BITS
 686
 687         with m.If(m_in.valid):
 688             comb += addrbits.eq(m_in.addr[amin : amax])
 689         with m.Else():
 690             comb += addrbits.eq(d_in.addr[amin : amax])
 691         comb += index.eq(addrbits)
 692
 693         # If we have any op and the previous op isn't finished,
 694         # then keep the same output for next cycle.
 695         with m.If(~r0_stall):
 696             sync += tlb_valid_way.eq(dtlb_valid_bits[index])
 697             sync += tlb_tag_way.eq(dtlb_tags[index])
 698             sync += tlb_pte_way.eq(dtlb_ptes[index])
 699
 700     def maybe_tlb_plrus(self, m, r1, tlb_plru_victim):
 701         """Generate TLB PLRUs
 702         """
 703         comb = m.d.comb
 704         sync = m.d.sync
 705
 706         if TLB_NUM_WAYS == 0:
 707             return
 708         for i in range(TLB_SET_SIZE):
 709             # TLB PLRU interface
 710             tlb_plru        = PLRU(TLB_WAY_BITS)
 711             setattr(m.submodules, "maybe_plru_%d" % i, tlb_plru)
 712             tlb_plru_acc_en = Signal()
 713
 714             comb += tlb_plru_acc_en.eq(r1.tlb_hit & (r1.tlb_hit_index == i))
 715             comb += tlb_plru.acc_en.eq(tlb_plru_acc_en)
 716             comb += tlb_plru.acc_i.eq(r1.tlb_hit_way)
 717             comb += tlb_plru_victim[i].eq(tlb_plru.lru_o)
 718
 719     def tlb_search(self, m, tlb_req_index, r0, r0_valid,
 720                    tlb_valid_way, tlb_tag_way, tlb_hit_way,
 721                    tlb_pte_way, pte, tlb_hit, valid_ra, perm_attr, ra):
 722
 723         comb = m.d.comb
 724
 725         hitway = Signal(TLB_WAY_BITS)
 726         hit    = Signal()
 727         eatag  = Signal(TLB_EA_TAG_BITS)
 728
 729         TLB_LG_END = TLB_LG_PGSZ + TLB_SET_BITS
 730         comb += tlb_req_index.eq(r0.req.addr[TLB_LG_PGSZ : TLB_LG_END])
 731         comb += eatag.eq(r0.req.addr[TLB_LG_END : 64 ])
 732
 733         for i in range(TLB_NUM_WAYS):
 734             is_tag_hit = Signal(name="is_tag_hit%d" % i)
 735             tlb_tag = Signal(TLB_EA_TAG_BITS, name="tlb_tag%d" % i)
 736             comb += tlb_tag.eq(read_tlb_tag(i, tlb_tag_way))
 737             comb += is_tag_hit.eq(tlb_valid_way[i] & (tlb_tag == eatag))
 738             with m.If(is_tag_hit):
 739                 comb += hitway.eq(i)
 740                 comb += hit.eq(1)
 741
 742         comb += tlb_hit.eq(hit & r0_valid)
 743         comb += tlb_hit_way.eq(hitway)
 744
 745         with m.If(tlb_hit):
 746             comb += pte.eq(read_tlb_pte(hitway, tlb_pte_way))
 747         comb += valid_ra.eq(tlb_hit | ~r0.req.virt_mode)
 748
 749         with m.If(r0.req.virt_mode):
 750             comb += ra.eq(Cat(Const(0, ROW_OFF_BITS),
 751                               r0.req.addr[ROW_OFF_BITS:TLB_LG_PGSZ],
 752                               pte[TLB_LG_PGSZ:REAL_ADDR_BITS]))
 753             comb += perm_attr.reference.eq(pte[8])
 754             comb += perm_attr.changed.eq(pte[7])
 755             comb += perm_attr.nocache.eq(pte[5])
 756             comb += perm_attr.priv.eq(pte[3])
 757             comb += perm_attr.rd_perm.eq(pte[2])
 758             comb += perm_attr.wr_perm.eq(pte[1])
 759         with m.Else():
 760             comb += ra.eq(Cat(Const(0, ROW_OFF_BITS),
 761                               r0.req.addr[ROW_OFF_BITS:REAL_ADDR_BITS]))
 762             comb += perm_attr.reference.eq(1)
 763             comb += perm_attr.changed.eq(1)
 764             comb += perm_attr.nocache.eq(0)
 765             comb += perm_attr.priv.eq(1)
 766             comb += perm_attr.rd_perm.eq(1)
 767             comb += perm_attr.wr_perm.eq(1)
 768
 769         with m.If(valid_ra):
 770             m.d.sync += Display("DCACHE virt mode %d hit %d ra %x pte %x",
 771                                 r0.req.virt_mode, tlb_hit, ra, pte)
 772             m.d.sync += Display("       perm ref=%d", perm_attr.reference)
 773             m.d.sync += Display("       perm chg=%d", perm_attr.changed)
 774             m.d.sync += Display("       perm noc=%d", perm_attr.nocache)
 775             m.d.sync += Display("       perm prv=%d", perm_attr.priv)
 776             m.d.sync += Display("       perm rdp=%d", perm_attr.rd_perm)
 777             m.d.sync += Display("       perm wrp=%d", perm_attr.wr_perm)
 778
 779     def tlb_update(self, m, r0_valid, r0, dtlb_valid_bits, tlb_req_index,
 780                     tlb_hit_way, tlb_hit, tlb_plru_victim, tlb_tag_way,
 781                     dtlb_tags, tlb_pte_way, dtlb_ptes):
 782
 783         dtlb_valids = TLBValidBitsArray()
 784
 785         comb = m.d.comb
 786         sync = m.d.sync
 787
 788         tlbie    = Signal()
 789         tlbwe    = Signal()
 790
 791         comb += tlbie.eq(r0_valid & r0.tlbie)
 792         comb += tlbwe.eq(r0_valid & r0.tlbld)
 793
 794         m.submodules.tlb_update = d = DTLBUpdate()
 795         with m.If(tlbie & r0.doall):
 796             # clear all valid bits at once
 797             for i in range(TLB_SET_SIZE):
 798                 sync += dtlb_valid_bits[i].eq(0)
 799         with m.If(d.updated):
 800             sync += dtlb_tags[tlb_req_index].eq(d.tb_out)
 801             sync += dtlb_ptes[tlb_req_index].eq(d.pb_out)
 802         with m.If(d.v_updated):
 803             sync += dtlb_valid_bits[tlb_req_index].eq(d.db_out)
 804
 805         comb += d.dv.eq(dtlb_valid_bits[tlb_req_index])
 806
 807         comb += d.tlbie.eq(tlbie)
 808         comb += d.tlbwe.eq(tlbwe)
 809         comb += d.doall.eq(r0.doall)
 810         comb += d.tlb_hit.eq(tlb_hit)
 811         comb += d.tlb_hit_way.eq(tlb_hit_way)
 812         comb += d.tlb_tag_way.eq(tlb_tag_way)
 813         comb += d.tlb_pte_way.eq(tlb_pte_way)
 814         comb += d.tlb_req_index.eq(tlb_req_index)
 815
 816         with m.If(tlb_hit):
 817             comb += d.repl_way.eq(tlb_hit_way)
 818         with m.Else():
 819             comb += d.repl_way.eq(tlb_plru_victim[tlb_req_index])
 820         comb += d.eatag.eq(r0.req.addr[TLB_LG_PGSZ + TLB_SET_BITS:64])
 821         comb += d.pte_data.eq(r0.req.data)
 822
 823     def maybe_plrus(self, m, r1, plru_victim):
 824         """Generate PLRUs
 825         """
 826         comb = m.d.comb
 827         sync = m.d.sync
 828
 829         if TLB_NUM_WAYS == 0:
 830             return
 831
 832         for i in range(NUM_LINES):
 833             # PLRU interface
 834             plru        = PLRU(WAY_BITS)
 835             setattr(m.submodules, "plru%d" % i, plru)
 836             plru_acc_en = Signal()
 837
 838             comb += plru_acc_en.eq(r1.cache_hit & (r1.hit_index == i))
 839             comb += plru.acc_en.eq(plru_acc_en)
 840             comb += plru.acc_i.eq(r1.hit_way)
 841             comb += plru_victim[i].eq(plru.lru_o)
 842
 843     def cache_tag_read(self, m, r0_stall, req_index, cache_tag_set, cache_tags):
 844         """Cache tag RAM read port
 845         """
 846         comb = m.d.comb
 847         sync = m.d.sync
 848         m_in, d_in = self.m_in, self.d_in
 849
 850         index = Signal(INDEX_BITS)
 851
 852         with m.If(r0_stall):
 853             comb += index.eq(req_index)
 854         with m.Elif(m_in.valid):
 855             comb += index.eq(get_index(m_in.addr))
 856         with m.Else():
 857             comb += index.eq(get_index(d_in.addr))
 858         sync += cache_tag_set.eq(cache_tags[index])
 859
 860     def dcache_request(self, m, r0, ra, req_index, req_row, req_tag,
 861                        r0_valid, r1, cache_valids, replace_way,
 862                        use_forward1_next, use_forward2_next,
 863                        req_hit_way, plru_victim, rc_ok, perm_attr,
 864                        valid_ra, perm_ok, access_ok, req_op, req_go,
 865                        tlb_pte_way,
 866                        tlb_hit, tlb_hit_way, tlb_valid_way, cache_tag_set,
 867                        cancel_store, req_same_tag, r0_stall, early_req_row):
 868         """Cache request parsing and hit detection
 869         """
 870
 871         comb = m.d.comb
 872         m_in, d_in = self.m_in, self.d_in
 873
 874         is_hit      = Signal()
 875         hit_way     = Signal(WAY_BITS)
 876         op          = Signal(Op)
 877         opsel       = Signal(3)
 878         go          = Signal()
 879         nc          = Signal()
 880         hit_set     = Array(Signal(name="hit_set_%d" % i) \
 881                                   for i in range(TLB_NUM_WAYS))
 882         cache_i_validdx = Signal(NUM_WAYS)
 883
 884         # Extract line, row and tag from request
 885         comb += req_index.eq(get_index(r0.req.addr))
 886         comb += req_row.eq(get_row(r0.req.addr))
 887         comb += req_tag.eq(get_tag(ra))
 888
 889         if False: # display on comb is a bit... busy.
 890             comb += Display("dcache_req addr:%x ra: %x idx: %x tag: %x row: %x",
 891                     r0.req.addr, ra, req_index, req_tag, req_row)
 892
 893         comb += go.eq(r0_valid & ~(r0.tlbie | r0.tlbld) & ~r1.ls_error)
 894         comb += cache_i_validdx.eq(cache_valids[req_index])
 895
 896         m.submodules.dcache_pend = dc = DCachePendingHit(tlb_pte_way,
 897                                 tlb_valid_way, tlb_hit_way,
 898                                 cache_i_validdx, cache_tag_set,
 899                                 r0.req.addr,
 900                                 hit_set)
 901
 902         comb += dc.tlb_hit.eq(tlb_hit)
 903         comb += dc.reload_tag.eq(r1.reload_tag)
 904         comb += dc.virt_mode.eq(r0.req.virt_mode)
 905         comb += dc.go.eq(go)
 906         comb += dc.req_index.eq(req_index)
 907         comb += is_hit.eq(dc.is_hit)
 908         comb += hit_way.eq(dc.hit_way)
 909         comb += req_same_tag.eq(dc.rel_match)
 910
 911         # See if the request matches the line currently being reloaded
 912         with m.If((r1.state == State.RELOAD_WAIT_ACK) &
 913                   (req_index == r1.store_index) & req_same_tag):
 914             # For a store, consider this a hit even if the row isn't
 915             # valid since it will be by the time we perform the store.
 916             # For a load, check the appropriate row valid bit.
 917             rrow = Signal(ROW_LINE_BITS)
 918             comb += rrow.eq(req_row)
 919             valid = r1.rows_valid[rrow]
 920             comb += is_hit.eq((~r0.req.load) | valid)
 921             comb += hit_way.eq(replace_way)
 922
 923         # Whether to use forwarded data for a load or not
 924         with m.If((get_row(r1.req.real_addr) == req_row) &
 925                   (r1.req.hit_way == hit_way)):
 926             # Only need to consider r1.write_bram here, since if we
 927             # are writing refill data here, then we don't have a
 928             # cache hit this cycle on the line being refilled.
 929             # (There is the possibility that the load following the
 930             # load miss that started the refill could be to the old
 931             # contents of the victim line, since it is a couple of
 932             # cycles after the refill starts before we see the updated
 933             # cache tag. In that case we don't use the bypass.)
 934             comb += use_forward1_next.eq(r1.write_bram)
 935         with m.If((r1.forward_row1 == req_row) & (r1.forward_way1 == hit_way)):
 936             comb += use_forward2_next.eq(r1.forward_valid1)
 937
 938         # The way that matched on a hit
 939         comb += req_hit_way.eq(hit_way)
 940
 941         # The way to replace on a miss
 942         with m.If(r1.write_tag):
 943             comb += replace_way.eq(plru_victim[r1.store_index])
 944         with m.Else():
 945             comb += replace_way.eq(r1.store_way)
 946
 947         # work out whether we have permission for this access
 948         # NB we don't yet implement AMR, thus no KUAP
 949         comb += rc_ok.eq(perm_attr.reference
 950                          & (r0.req.load | perm_attr.changed))
 951         comb += perm_ok.eq((r0.req.priv_mode | (~perm_attr.priv)) &
 952                            (perm_attr.wr_perm |
 953                               (r0.req.load & perm_attr.rd_perm)))
 954         comb += access_ok.eq(valid_ra & perm_ok & rc_ok)
 955         # Combine the request and cache hit status to decide what
 956         # operation needs to be done
 957         comb += nc.eq(r0.req.nc | perm_attr.nocache)
 958         comb += op.eq(Op.OP_NONE)
 959         with m.If(go):
 960             with m.If(~access_ok):
 961                 m.d.sync += Display("DCACHE access fail valid_ra=%d p=%d rc=%d",
 962                                  valid_ra, perm_ok, rc_ok)
 963                 comb += op.eq(Op.OP_BAD)
 964             with m.Elif(cancel_store):
 965                 m.d.sync += Display("DCACHE cancel store")
 966                 comb += op.eq(Op.OP_STCX_FAIL)
 967             with m.Else():
 968                 m.d.sync += Display("DCACHE valid_ra=%d nc=%d ld=%d",
 969                                  valid_ra, nc, r0.req.load)
 970                 comb += opsel.eq(Cat(is_hit, nc, r0.req.load))
 971                 with m.Switch(opsel):
 972                     with m.Case(0b101): comb += op.eq(Op.OP_LOAD_HIT)
 973                     with m.Case(0b100): comb += op.eq(Op.OP_LOAD_MISS)
 974                     with m.Case(0b110): comb += op.eq(Op.OP_LOAD_NC)
 975                     with m.Case(0b001): comb += op.eq(Op.OP_STORE_HIT)
 976                     with m.Case(0b000): comb += op.eq(Op.OP_STORE_MISS)
 977                     with m.Case(0b010): comb += op.eq(Op.OP_STORE_MISS)
 978                     with m.Case(0b011): comb += op.eq(Op.OP_BAD)
 979                     with m.Case(0b111): comb += op.eq(Op.OP_BAD)
 980         comb += req_op.eq(op)
 981         comb += req_go.eq(go)
 982
 983         # Version of the row number that is valid one cycle earlier
 984         # in the cases where we need to read the cache data BRAM.
 985         # If we're stalling then we need to keep reading the last
 986         # row requested.
 987         with m.If(~r0_stall):
 988             with m.If(m_in.valid):
 989                 comb += early_req_row.eq(get_row(m_in.addr))
 990             with m.Else():
 991                 comb += early_req_row.eq(get_row(d_in.addr))
 992         with m.Else():
 993             comb += early_req_row.eq(req_row)
 994
 995     def reservation_comb(self, m, cancel_store, set_rsrv, clear_rsrv,
 996                          r0_valid, r0, reservation):
 997         """Handle load-with-reservation and store-conditional instructions
 998         """
 999         comb = m.d.comb
1000
1001         with m.If(r0_valid & r0.req.reserve):
1002             # XXX generate alignment interrupt if address
1003             # is not aligned XXX or if r0.req.nc = '1'
1004             with m.If(r0.req.load):
1005                 comb += set_rsrv.eq(r0.req.atomic_last) # load with reservation
1006             with m.Else():
1007                 comb += clear_rsrv.eq(r0.req.atomic_last) # store conditional
1008                 with m.If((~reservation.valid) |
1009                          (r0.req.addr[LINE_OFF_BITS:64] != reservation.addr)):
1010                     comb += cancel_store.eq(1)
1011
1012     def reservation_reg(self, m, r0_valid, access_ok, set_rsrv, clear_rsrv,
1013                         reservation, r0):
1014
1015         comb = m.d.comb
1016         sync = m.d.sync
1017
1018         with m.If(r0_valid & access_ok):
1019             with m.If(clear_rsrv):
1020                 sync += reservation.valid.eq(0)
1021             with m.Elif(set_rsrv):
1022                 sync += reservation.valid.eq(1)
1023                 sync += reservation.addr.eq(r0.req.addr[LINE_OFF_BITS:64])
1024
1025     def writeback_control(self, m, r1, cache_out_row):
1026         """Return data for loads & completion control logic
1027         """
1028         comb = m.d.comb
1029         sync = m.d.sync
1030         d_out, m_out = self.d_out, self.m_out
1031
1032         data_out = Signal(64)
1033         data_fwd = Signal(64)
1034
1035         # Use the bypass if are reading the row that was
1036         # written 1 or 2 cycles ago, including for the
1037         # slow_valid = 1 case (i.e. completing a load
1038         # miss or a non-cacheable load).
1039         with m.If(r1.use_forward1):
1040             comb += data_fwd.eq(r1.forward_data1)
1041         with m.Else():
1042             comb += data_fwd.eq(r1.forward_data2)
1043
1044         comb += data_out.eq(cache_out_row)
1045
1046         for i in range(8):
1047             with m.If(r1.forward_sel[i]):
1048                 dsel = data_fwd.word_select(i, 8)
1049                 comb += data_out.word_select(i, 8).eq(dsel)
1050
1051         comb += d_out.valid.eq(r1.ls_valid)
1052         comb += d_out.data.eq(data_out)
1053         comb += d_out.store_done.eq(~r1.stcx_fail)
1054         comb += d_out.error.eq(r1.ls_error)
1055         comb += d_out.cache_paradox.eq(r1.cache_paradox)
1056
1057         # Outputs to MMU
1058         comb += m_out.done.eq(r1.mmu_done)
1059         comb += m_out.err.eq(r1.mmu_error)
1060         comb += m_out.data.eq(data_out)
1061
1062         # We have a valid load or store hit or we just completed
1063         # a slow op such as a load miss, a NC load or a store
1064         #
1065         # Note: the load hit is delayed by one cycle. However it
1066         # can still not collide with r.slow_valid (well unless I
1067         # miscalculated) because slow_valid can only be set on a
1068         # subsequent request and not on its first cycle (the state
1069         # machine must have advanced), which makes slow_valid
1070         # at least 2 cycles from the previous hit_load_valid.
1071
1072         # Sanity: Only one of these must be set in any given cycle
1073
1074         if False: # TODO: need Display to get this to work
1075             assert (r1.slow_valid & r1.stcx_fail) != 1, \
1076             "unexpected slow_valid collision with stcx_fail"
1077
1078             assert ((r1.slow_valid | r1.stcx_fail) | r1.hit_load_valid) != 1, \
1079              "unexpected hit_load_delayed collision with slow_valid"
1080
1081         with m.If(~r1.mmu_req):
1082             # Request came from loadstore1...
1083             # Load hit case is the standard path
1084             with m.If(r1.hit_load_valid):
1085                 sync += Display("completing load hit data=%x", data_out)
1086
1087             # error cases complete without stalling
1088             with m.If(r1.ls_error):
1089                 with m.If(r1.dcbz):
1090                     sync += Display("completing dcbz with error")
1091                 with m.Else():
1092                     sync += Display("completing ld/st with error")
1093
1094             # Slow ops (load miss, NC, stores)
1095             with m.If(r1.slow_valid):
1096                 sync += Display("completing store or load miss adr=%x data=%x",
1097                                 r1.req.real_addr, data_out)
1098
1099         with m.Else():
1100             # Request came from MMU
1101             with m.If(r1.hit_load_valid):
1102                 sync += Display("completing load hit to MMU, data=%x",
1103                                 m_out.data)
1104             # error cases complete without stalling
1105             with m.If(r1.mmu_error):
1106                 sync += Display("combpleting MMU ld with error")
1107
1108             # Slow ops (i.e. load miss)
1109             with m.If(r1.slow_valid):
1110                 sync += Display("completing MMU load miss, adr=%x data=%x",
1111                                 r1.req.real_addr, m_out.data)
1112
1113     def rams(self, m, r1, early_req_row, cache_out_row, replace_way):
1114         """rams
1115         Generate a cache RAM for each way. This handles the normal
1116         reads, writes from reloads and the special store-hit update
1117         path as well.
1118
1119         Note: the BRAMs have an extra read buffer, meaning the output
1120         is pipelined an extra cycle. This differs from the
1121         icache. The writeback logic needs to take that into
1122         account by using 1-cycle delayed signals for load hits.
1123         """
1124         comb = m.d.comb
1125         bus = self.bus
1126
1127         for i in range(NUM_WAYS):
1128             do_read  = Signal(name="do_rd%d" % i)
1129             rd_addr  = Signal(ROW_BITS, name="rd_addr_%d" % i)
1130             do_write = Signal(name="do_wr%d" % i)
1131             wr_addr  = Signal(ROW_BITS, name="wr_addr_%d" % i)
1132             wr_data  = Signal(WB_DATA_BITS, name="din_%d" % i)
1133             wr_sel   = Signal(ROW_SIZE)
1134             wr_sel_m = Signal(ROW_SIZE)
1135             _d_out   = Signal(WB_DATA_BITS, name="dout_%d" % i) # cache_row_t
1136
1137             way = CacheRam(ROW_BITS, WB_DATA_BITS, ADD_BUF=True, ram_num=i)
1138             setattr(m.submodules, "cacheram_%d" % i, way)
1139
1140             comb += way.rd_en.eq(do_read)
1141             comb += way.rd_addr.eq(rd_addr)
1142             comb += _d_out.eq(way.rd_data_o)
1143             comb += way.wr_sel.eq(wr_sel_m)
1144             comb += way.wr_addr.eq(wr_addr)
1145             comb += way.wr_data.eq(wr_data)
1146
1147             # Cache hit reads
1148             comb += do_read.eq(1)
1149             comb += rd_addr.eq(early_req_row)
1150             with m.If(r1.hit_way == i):
1151                 comb += cache_out_row.eq(_d_out)
1152
1153             # Write mux:
1154             #
1155             # Defaults to wishbone read responses (cache refill)
1156             #
1157             # For timing, the mux on wr_data/sel/addr is not
1158             # dependent on anything other than the current state.
1159
1160             with m.If(r1.write_bram):
1161                 # Write store data to BRAM.  This happens one
1162                 # cycle after the store is in r0.
1163                 comb += wr_data.eq(r1.req.data)
1164                 comb += wr_sel.eq(r1.req.byte_sel)
1165                 comb += wr_addr.eq(get_row(r1.req.real_addr))
1166
1167                 with m.If(i == r1.req.hit_way):
1168                     comb += do_write.eq(1)
1169             with m.Else():
1170                 # Otherwise, we might be doing a reload or a DCBZ
1171                 with m.If(r1.dcbz):
1172                     comb += wr_data.eq(0)
1173                 with m.Else():
1174                     comb += wr_data.eq(bus.dat_r)
1175                 comb += wr_addr.eq(r1.store_row)
1176                 comb += wr_sel.eq(~0) # all 1s
1177
1178                 with m.If((r1.state == State.RELOAD_WAIT_ACK)
1179                           & bus.ack & (replace_way == i)):
1180                     comb += do_write.eq(1)
1181
1182             # Mask write selects with do_write since BRAM
1183             # doesn't have a global write-enable
1184             with m.If(do_write):
1185                 comb += wr_sel_m.eq(wr_sel)
1186
1187     # Cache hit synchronous machine for the easy case.
1188     # This handles load hits.
1189     # It also handles error cases (TLB miss, cache paradox)
1190     def dcache_fast_hit(self, m, req_op, r0_valid, r0, r1,
1191                         req_hit_way, req_index, req_tag, access_ok,
1192                         tlb_hit, tlb_hit_way, tlb_req_index):
1193
1194         comb = m.d.comb
1195         sync = m.d.sync
1196
1197         with m.If(req_op != Op.OP_NONE):
1198             sync += Display("op:%d addr:%x nc: %d idx: %x tag: %x way: %x",
1199                     req_op, r0.req.addr, r0.req.nc,
1200                     req_index, req_tag, req_hit_way)
1201
1202         with m.If(r0_valid):
1203             sync += r1.mmu_req.eq(r0.mmu_req)
1204
1205         # Fast path for load/store hits.
1206         # Set signals for the writeback controls.
1207         sync += r1.hit_way.eq(req_hit_way)
1208         sync += r1.hit_index.eq(req_index)
1209
1210         with m.If(req_op == Op.OP_LOAD_HIT):
1211             sync += r1.hit_load_valid.eq(1)
1212         with m.Else():
1213             sync += r1.hit_load_valid.eq(0)
1214
1215         with m.If((req_op == Op.OP_LOAD_HIT) | (req_op == Op.OP_STORE_HIT)):
1216             sync += r1.cache_hit.eq(1)
1217         with m.Else():
1218             sync += r1.cache_hit.eq(0)
1219
1220         with m.If(req_op == Op.OP_BAD):
1221             sync += Display("Signalling ld/st error "
1222                             "ls_error=%i mmu_error=%i cache_paradox=%i",
1223                             ~r0.mmu_req,r0.mmu_req,access_ok)
1224             sync += r1.ls_error.eq(~r0.mmu_req)
1225             sync += r1.mmu_error.eq(r0.mmu_req)
1226             sync += r1.cache_paradox.eq(access_ok)
1227
1228         with m.Else():
1229             sync += r1.ls_error.eq(0)
1230             sync += r1.mmu_error.eq(0)
1231             sync += r1.cache_paradox.eq(0)
1232
1233         with m.If(req_op == Op.OP_STCX_FAIL):
1234             sync += r1.stcx_fail.eq(1)
1235         with m.Else():
1236             sync += r1.stcx_fail.eq(0)
1237
1238         # Record TLB hit information for updating TLB PLRU
1239         sync += r1.tlb_hit.eq(tlb_hit)
1240         sync += r1.tlb_hit_way.eq(tlb_hit_way)
1241         sync += r1.tlb_hit_index.eq(tlb_req_index)
1242
1243     # Memory accesses are handled by this state machine:
1244     #
1245     #   * Cache load miss/reload (in conjunction with "rams")
1246     #   * Load hits for non-cachable forms
1247     #   * Stores (the collision case is handled in "rams")
1248     #
1249     # All wishbone requests generation is done here.
1250     # This machine operates at stage 1.
1251     def dcache_slow(self, m, r1, use_forward1_next, use_forward2_next,
1252                     cache_valids, r0, replace_way,
1253                     req_hit_way, req_same_tag,
1254                     r0_valid, req_op, cache_tags, req_go, ra):
1255
1256         comb = m.d.comb
1257         sync = m.d.sync
1258         bus = self.bus
1259         d_in = self.d_in
1260
1261         req         = MemAccessRequest("mreq_ds")
1262
1263         req_row = Signal(ROW_BITS)
1264         req_idx = Signal(INDEX_BITS)
1265         req_tag = Signal(TAG_BITS)
1266         comb += req_idx.eq(get_index(req.real_addr))
1267         comb += req_row.eq(get_row(req.real_addr))
1268         comb += req_tag.eq(get_tag(req.real_addr))
1269
1270         sync += r1.use_forward1.eq(use_forward1_next)
1271         sync += r1.forward_sel.eq(0)
1272
1273         with m.If(use_forward1_next):
1274             sync += r1.forward_sel.eq(r1.req.byte_sel)
1275         with m.Elif(use_forward2_next):
1276             sync += r1.forward_sel.eq(r1.forward_sel1)
1277
1278         sync += r1.forward_data2.eq(r1.forward_data1)
1279         with m.If(r1.write_bram):
1280             sync += r1.forward_data1.eq(r1.req.data)
1281             sync += r1.forward_sel1.eq(r1.req.byte_sel)
1282             sync += r1.forward_way1.eq(r1.req.hit_way)
1283             sync += r1.forward_row1.eq(get_row(r1.req.real_addr))
1284             sync += r1.forward_valid1.eq(1)
1285         with m.Else():
1286             with m.If(r1.dcbz):
1287                 sync += r1.forward_data1.eq(0)
1288             with m.Else():
1289                 sync += r1.forward_data1.eq(bus.dat_r)
1290             sync += r1.forward_sel1.eq(~0) # all 1s
1291             sync += r1.forward_way1.eq(replace_way)
1292             sync += r1.forward_row1.eq(r1.store_row)
1293             sync += r1.forward_valid1.eq(0)
1294
1295         # One cycle pulses reset
1296         sync += r1.slow_valid.eq(0)
1297         sync += r1.write_bram.eq(0)
1298         sync += r1.inc_acks.eq(0)
1299         sync += r1.dec_acks.eq(0)
1300
1301         sync += r1.ls_valid.eq(0)
1302         # complete tlbies and TLB loads in the third cycle
1303         sync += r1.mmu_done.eq(r0_valid & (r0.tlbie | r0.tlbld))
1304
1305         with m.If((req_op == Op.OP_LOAD_HIT) | (req_op == Op.OP_STCX_FAIL)):
1306             with m.If(~r0.mmu_req):
1307                 sync += r1.ls_valid.eq(1)
1308             with m.Else():
1309                 sync += r1.mmu_done.eq(1)
1310
1311         with m.If(r1.write_tag):
1312             # Store new tag in selected way
1313             for i in range(NUM_WAYS):
1314                 with m.If(i == replace_way):
1315                     ct = Signal(TAG_RAM_WIDTH)
1316                     comb += ct.eq(cache_tags[r1.store_index])
1317                     """
1318 TODO: check this
1319 cache_tags(r1.store_index)((i + 1) * TAG_WIDTH - 1 downto i * TAG_WIDTH) <=
1320                     (TAG_WIDTH - 1 downto TAG_BITS => '0') & r1.reload_tag;
1321                     """
1322                     comb += ct.word_select(i, TAG_WIDTH).eq(r1.reload_tag)
1323                     sync += cache_tags[r1.store_index].eq(ct)
1324             sync += r1.store_way.eq(replace_way)
1325             sync += r1.write_tag.eq(0)
1326
1327         # Take request from r1.req if there is one there,
1328         # else from req_op, ra, etc.
1329         with m.If(r1.full):
1330             comb += req.eq(r1.req)
1331         with m.Else():
1332             comb += req.op.eq(req_op)
1333             comb += req.valid.eq(req_go)
1334             comb += req.mmu_req.eq(r0.mmu_req)
1335             comb += req.dcbz.eq(r0.req.dcbz)
1336             comb += req.real_addr.eq(ra)
1337
1338             with m.If(r0.req.dcbz):
1339                 # force data to 0 for dcbz
1340                 comb += req.data.eq(0)
1341             with m.Elif(r0.d_valid):
1342                 comb += req.data.eq(r0.req.data)
1343             with m.Else():
1344                 comb += req.data.eq(d_in.data)
1345
1346             # Select all bytes for dcbz
1347             # and for cacheable loads
1348             with m.If(r0.req.dcbz | (r0.req.load & ~r0.req.nc)):
1349                 comb += req.byte_sel.eq(~0) # all 1s
1350             with m.Else():
1351                 comb += req.byte_sel.eq(r0.req.byte_sel)
1352             comb += req.hit_way.eq(req_hit_way)
1353             comb += req.same_tag.eq(req_same_tag)
1354
1355             # Store the incoming request from r0,
1356             # if it is a slow request
1357             # Note that r1.full = 1 implies req_op = OP_NONE
1358             with m.If((req_op == Op.OP_LOAD_MISS)
1359                       | (req_op == Op.OP_LOAD_NC)
1360                       | (req_op == Op.OP_STORE_MISS)
1361                       | (req_op == Op.OP_STORE_HIT)):
1362                 sync += r1.req.eq(req)
1363                 sync += r1.full.eq(1)
1364
1365         # Main state machine
1366         with m.Switch(r1.state):
1367
1368             with m.Case(State.IDLE):
1369                 sync += r1.wb.adr.eq(req.real_addr[ROW_LINE_BITS:])
1370                 sync += r1.wb.sel.eq(req.byte_sel)
1371                 sync += r1.wb.dat.eq(req.data)
1372                 sync += r1.dcbz.eq(req.dcbz)
1373
1374                 # Keep track of our index and way
1375                 # for subsequent stores.
1376                 sync += r1.store_index.eq(req_idx)
1377                 sync += r1.store_row.eq(req_row)
1378                 sync += r1.end_row_ix.eq(get_row_of_line(req_row)-1)
1379                 sync += r1.reload_tag.eq(req_tag)
1380                 sync += r1.req.same_tag.eq(1)
1381
1382                 with m.If(req.op == Op.OP_STORE_HIT):
1383                     sync += r1.store_way.eq(req.hit_way)
1384
1385                 # Reset per-row valid bits,
1386                 # ready for handling OP_LOAD_MISS
1387                 for i in range(ROW_PER_LINE):
1388                     sync += r1.rows_valid[i].eq(0)
1389
1390                 with m.If(req_op != Op.OP_NONE):
1391                     sync += Display("cache op %d", req.op)
1392
1393                 with m.Switch(req.op):
1394                     with m.Case(Op.OP_LOAD_HIT):
1395                         # stay in IDLE state
1396                         pass
1397
1398                     with m.Case(Op.OP_LOAD_MISS):
1399                         sync += Display("cache miss real addr: %x " \
1400                                 "idx: %x tag: %x",
1401                                 req.real_addr, req_row, req_tag)
1402
1403                         # Start the wishbone cycle
1404                         sync += r1.wb.we.eq(0)
1405                         sync += r1.wb.cyc.eq(1)
1406                         sync += r1.wb.stb.eq(1)
1407
1408                         # Track that we had one request sent
1409                         sync += r1.state.eq(State.RELOAD_WAIT_ACK)
1410                         sync += r1.write_tag.eq(1)
1411
1412                     with m.Case(Op.OP_LOAD_NC):
1413                         sync += r1.wb.cyc.eq(1)
1414                         sync += r1.wb.stb.eq(1)
1415                         sync += r1.wb.we.eq(0)
1416                         sync += r1.state.eq(State.NC_LOAD_WAIT_ACK)
1417
1418                     with m.Case(Op.OP_STORE_HIT, Op.OP_STORE_MISS):
1419                         with m.If(~req.dcbz):
1420                             sync += r1.state.eq(State.STORE_WAIT_ACK)
1421                             sync += r1.acks_pending.eq(1)
1422                             sync += r1.full.eq(0)
1423                             sync += r1.slow_valid.eq(1)
1424
1425                             with m.If(~req.mmu_req):
1426                                 sync += r1.ls_valid.eq(1)
1427                             with m.Else():
1428                                 sync += r1.mmu_done.eq(1)
1429
1430                             with m.If(req.op == Op.OP_STORE_HIT):
1431                                 sync += r1.write_bram.eq(1)
1432                         with m.Else():
1433                             # dcbz is handled much like a load miss except
1434                             # that we are writing to memory instead of reading
1435                             sync += r1.state.eq(State.RELOAD_WAIT_ACK)
1436
1437                             with m.If(req.op == Op.OP_STORE_MISS):
1438                                 sync += r1.write_tag.eq(1)
1439
1440                         sync += r1.wb.we.eq(1)
1441                         sync += r1.wb.cyc.eq(1)
1442                         sync += r1.wb.stb.eq(1)
1443
1444                     # OP_NONE and OP_BAD do nothing
1445                     # OP_BAD & OP_STCX_FAIL were
1446                     # handled above already
1447                     with m.Case(Op.OP_NONE):
1448                         pass
1449                     with m.Case(Op.OP_BAD):
1450                         pass
1451                     with m.Case(Op.OP_STCX_FAIL):
1452                         pass
1453
1454             with m.Case(State.RELOAD_WAIT_ACK):
1455                 ld_stbs_done = Signal()
1456                 # Requests are all sent if stb is 0
1457                 comb += ld_stbs_done.eq(~r1.wb.stb)
1458
1459                 # If we are still sending requests, was one accepted?
1460                 with m.If((~bus.stall) & r1.wb.stb):
1461                     # That was the last word?  We are done sending.
1462                     # Clear stb and set ld_stbs_done so we can handle an
1463                     # eventual last ack on the same cycle.
1464                     # sigh - reconstruct wb adr with 3 extra 0s at front
1465                     wb_adr = Cat(Const(0, ROW_OFF_BITS), r1.wb.adr)
1466                     with m.If(is_last_row_addr(wb_adr, r1.end_row_ix)):
1467                         sync += r1.wb.stb.eq(0)
1468                         comb += ld_stbs_done.eq(1)
1469
1470                     # Calculate the next row address in the current cache line
1471                     row = Signal(LINE_OFF_BITS-ROW_OFF_BITS)
1472                     comb += row.eq(r1.wb.adr)
1473                     sync += r1.wb.adr[:LINE_OFF_BITS-ROW_OFF_BITS].eq(row+1)
1474
1475                 # Incoming acks processing
1476                 sync += r1.forward_valid1.eq(bus.ack)
1477                 with m.If(bus.ack):
1478                     srow = Signal(ROW_LINE_BITS)
1479                     comb += srow.eq(r1.store_row)
1480                     sync += r1.rows_valid[srow].eq(1)
1481
1482                     # If this is the data we were looking for,
1483                     # we can complete the request next cycle.
1484                     # Compare the whole address in case the
1485                     # request in r1.req is not the one that
1486                     # started this refill.
1487                     with m.If(req.valid & r1.req.same_tag &
1488                               ((r1.dcbz & r1.req.dcbz) |
1489                                (~r1.dcbz & (r1.req.op == Op.OP_LOAD_MISS))) &
1490                                 (r1.store_row == get_row(req.real_addr))):
1491                         sync += r1.full.eq(0)
1492                         sync += r1.slow_valid.eq(1)
1493                         with m.If(~r1.mmu_req):
1494                             sync += r1.ls_valid.eq(1)
1495                         with m.Else():
1496                             sync += r1.mmu_done.eq(1)
1497                         sync += r1.forward_sel.eq(~0) # all 1s
1498                         sync += r1.use_forward1.eq(1)
1499
1500                     # Check for completion
1501                     with m.If(ld_stbs_done & is_last_row(r1.store_row,
1502                                                       r1.end_row_ix)):
1503                         # Complete wishbone cycle
1504                         sync += r1.wb.cyc.eq(0)
1505
1506                         # Cache line is now valid
1507                         cv = Signal(INDEX_BITS)
1508                         comb += cv.eq(cache_valids[r1.store_index])
1509                         comb += cv.bit_select(r1.store_way, 1).eq(1)
1510                         sync += cache_valids[r1.store_index].eq(cv)
1511
1512                         sync += r1.state.eq(State.IDLE)
1513                         sync += Display("cache valid set %x "
1514                                         "idx %d way %d",
1515                                          cv, r1.store_index, r1.store_way)
1516
1517                     # Increment store row counter
1518                     sync += r1.store_row.eq(next_row(r1.store_row))
1519
1520             with m.Case(State.STORE_WAIT_ACK):
1521                 st_stbs_done = Signal()
1522                 acks        = Signal(3)
1523                 adjust_acks = Signal(3)
1524
1525                 comb += st_stbs_done.eq(~r1.wb.stb)
1526                 comb += acks.eq(r1.acks_pending)
1527
1528                 with m.If(r1.inc_acks != r1.dec_acks):
1529                     with m.If(r1.inc_acks):
1530                         comb += adjust_acks.eq(acks + 1)
1531                     with m.Else():
1532                         comb += adjust_acks.eq(acks - 1)
1533                 with m.Else():
1534                     comb += adjust_acks.eq(acks)
1535
1536                 sync += r1.acks_pending.eq(adjust_acks)
1537
1538                 # Clear stb when slave accepted request
1539                 with m.If(~bus.stall):
1540                     # See if there is another store waiting
1541                     # to be done which is in the same real page.
1542                     with m.If(req.valid):
1543                         _ra = req.real_addr[ROW_LINE_BITS:SET_SIZE_BITS]
1544                         sync += r1.wb.adr[0:SET_SIZE_BITS].eq(_ra)
1545                         sync += r1.wb.dat.eq(req.data)
1546                         sync += r1.wb.sel.eq(req.byte_sel)
1547
1548                     with m.If((adjust_acks < 7) & req.same_tag &
1549                                 ((req.op == Op.OP_STORE_MISS)
1550                                  | (req.op == Op.OP_STORE_HIT))):
1551                         sync += r1.wb.stb.eq(1)
1552                         comb += st_stbs_done.eq(0)
1553
1554                         with m.If(req.op == Op.OP_STORE_HIT):
1555                             sync += r1.write_bram.eq(1)
1556                         sync += r1.full.eq(0)
1557                         sync += r1.slow_valid.eq(1)
1558
1559                         # Store requests never come from the MMU
1560                         sync += r1.ls_valid.eq(1)
1561                         comb += st_stbs_done.eq(0)
1562                         sync += r1.inc_acks.eq(1)
1563                     with m.Else():
1564                         sync += r1.wb.stb.eq(0)
1565                         comb += st_stbs_done.eq(1)
1566
1567                 # Got ack ? See if complete.
1568                 with m.If(bus.ack):
1569                     with m.If(st_stbs_done & (adjust_acks == 1)):
1570                         sync += r1.state.eq(State.IDLE)
1571                         sync += r1.wb.cyc.eq(0)
1572                         sync += r1.wb.stb.eq(0)
1573                     sync += r1.dec_acks.eq(1)
1574
1575             with m.Case(State.NC_LOAD_WAIT_ACK):
1576                 # Clear stb when slave accepted request
1577                 with m.If(~bus.stall):
1578                     sync += r1.wb.stb.eq(0)
1579
1580                 # Got ack ? complete.
1581                 with m.If(bus.ack):
1582                     sync += r1.state.eq(State.IDLE)
1583                     sync += r1.full.eq(0)
1584                     sync += r1.slow_valid.eq(1)
1585
1586                     with m.If(~r1.mmu_req):
1587                         sync += r1.ls_valid.eq(1)
1588                     with m.Else():
1589                         sync += r1.mmu_done.eq(1)
1590
1591                     sync += r1.forward_sel.eq(~0) # all 1s
1592                     sync += r1.use_forward1.eq(1)
1593                     sync += r1.wb.cyc.eq(0)
1594                     sync += r1.wb.stb.eq(0)
1595
1596     def dcache_log(self, m, r1, valid_ra, tlb_hit_way, stall_out):
1597
1598         sync = m.d.sync
1599         d_out, bus, log_out = self.d_out, self.bus, self.log_out
1600
1601         sync += log_out.eq(Cat(r1.state[:3], valid_ra, tlb_hit_way[:3],
1602                                stall_out, req_op[:3], d_out.valid, d_out.error,
1603                                r1.wb.cyc, r1.wb.stb, bus.ack, bus.stall,
1604                                r1.real_adr[3:6]))
1605
1606     def elaborate(self, platform):
1607
1608         m = Module()
1609         comb = m.d.comb
1610         d_in = self.d_in
1611
1612         # Storage. Hopefully "cache_rows" is a BRAM, the rest is LUTs
1613         cache_tags       = CacheTagArray()
1614         cache_tag_set    = Signal(TAG_RAM_WIDTH)
1615         cache_valids = CacheValidBitsArray()
1616
1617         # TODO attribute ram_style : string;
1618         # TODO attribute ram_style of cache_tags : signal is "distributed";
1619
1620         """note: these are passed to nmigen.hdl.Memory as "attributes".
1621            don't know how, just that they are.
1622         """
1623         dtlb_valid_bits = TLBValidBitsArray()
1624         dtlb_tags       = TLBTagsArray()
1625         dtlb_ptes       = TLBPtesArray()
1626         # TODO attribute ram_style of
1627         #  dtlb_tags : signal is "distributed";
1628         # TODO attribute ram_style of
1629         #  dtlb_ptes : signal is "distributed";
1630
1631         r0      = RegStage0("r0")
1632         r0_full = Signal()
1633
1634         r1 = RegStage1("r1")
1635
1636         reservation = Reservation()
1637
1638         # Async signals on incoming request
1639         req_index    = Signal(INDEX_BITS)
1640         req_row      = Signal(ROW_BITS)
1641         req_hit_way  = Signal(WAY_BITS)
1642         req_tag      = Signal(TAG_BITS)
1643         req_op       = Signal(Op)
1644         req_data     = Signal(64)
1645         req_same_tag = Signal()
1646         req_go       = Signal()
1647
1648         early_req_row     = Signal(ROW_BITS)
1649
1650         cancel_store      = Signal()
1651         set_rsrv          = Signal()
1652         clear_rsrv        = Signal()
1653
1654         r0_valid          = Signal()
1655         r0_stall          = Signal()
1656
1657         use_forward1_next = Signal()
1658         use_forward2_next = Signal()
1659
1660         cache_out_row     = Signal(WB_DATA_BITS)
1661
1662         plru_victim       = PLRUOut()
1663         replace_way       = Signal(WAY_BITS)
1664
1665         # Wishbone read/write/cache write formatting signals
1666         bus_sel           = Signal(8)
1667
1668         # TLB signals
1669         tlb_tag_way   = Signal(TLB_TAG_WAY_BITS)
1670         tlb_pte_way   = Signal(TLB_PTE_WAY_BITS)
1671         tlb_valid_way = Signal(TLB_NUM_WAYS)
1672         tlb_req_index = Signal(TLB_SET_BITS)
1673         tlb_hit       = Signal()
1674         tlb_hit_way   = Signal(TLB_WAY_BITS)
1675         pte           = Signal(TLB_PTE_BITS)
1676         ra            = Signal(REAL_ADDR_BITS)
1677         valid_ra      = Signal()
1678         perm_attr     = PermAttr("dc_perms")
1679         rc_ok         = Signal()
1680         perm_ok       = Signal()
1681         access_ok     = Signal()
1682
1683         tlb_plru_victim = TLBPLRUOut()
1684
1685         # we don't yet handle collisions between loadstore1 requests
1686         # and MMU requests
1687         comb += self.m_out.stall.eq(0)
1688
1689         # Hold off the request in r0 when r1 has an uncompleted request
1690         comb += r0_stall.eq(r0_full & (r1.full | d_in.hold))
1691         comb += r0_valid.eq(r0_full & ~r1.full & ~d_in.hold)
1692         comb += self.stall_out.eq(r0_stall)
1693
1694
1695         # deal with litex not doing wishbone pipeline mode
1696         # XXX in wrong way.  FIFOs are needed in the SRAM test
1697         # so that stb/ack match up
1698         comb += self.bus.stall.eq(self.bus.cyc & ~self.bus.ack)
1699
1700         # Wire up wishbone request latch out of stage 1
1701         comb += self.bus.we.eq(r1.wb.we)
1702         comb += self.bus.adr.eq(r1.wb.adr)
1703         comb += self.bus.sel.eq(r1.wb.sel)
1704         comb += self.bus.stb.eq(r1.wb.stb)
1705         comb += self.bus.dat_w.eq(r1.wb.dat)
1706         comb += self.bus.cyc.eq(r1.wb.cyc)
1707
1708         # call sub-functions putting everything together, using shared
1709         # signals established above
1710         self.stage_0(m, r0, r1, r0_full)
1711         self.tlb_read(m, r0_stall, tlb_valid_way,
1712                       tlb_tag_way, tlb_pte_way, dtlb_valid_bits,
1713                       dtlb_tags, dtlb_ptes)
1714         self.tlb_search(m, tlb_req_index, r0, r0_valid,
1715                         tlb_valid_way, tlb_tag_way, tlb_hit_way,
1716                         tlb_pte_way, pte, tlb_hit, valid_ra, perm_attr, ra)
1717         self.tlb_update(m, r0_valid, r0, dtlb_valid_bits, tlb_req_index,
1718                         tlb_hit_way, tlb_hit, tlb_plru_victim, tlb_tag_way,
1719                         dtlb_tags, tlb_pte_way, dtlb_ptes)
1720         self.maybe_plrus(m, r1, plru_victim)
1721         self.maybe_tlb_plrus(m, r1, tlb_plru_victim)
1722         self.cache_tag_read(m, r0_stall, req_index, cache_tag_set, cache_tags)
1723         self.dcache_request(m, r0, ra, req_index, req_row, req_tag,
1724                            r0_valid, r1, cache_valids, replace_way,
1725                            use_forward1_next, use_forward2_next,
1726                            req_hit_way, plru_victim, rc_ok, perm_attr,
1727                            valid_ra, perm_ok, access_ok, req_op, req_go,
1728                            tlb_pte_way,
1729                            tlb_hit, tlb_hit_way, tlb_valid_way, cache_tag_set,
1730                            cancel_store, req_same_tag, r0_stall, early_req_row)
1731         self.reservation_comb(m, cancel_store, set_rsrv, clear_rsrv,
1732                            r0_valid, r0, reservation)
1733         self.reservation_reg(m, r0_valid, access_ok, set_rsrv, clear_rsrv,
1734                            reservation, r0)
1735         self.writeback_control(m, r1, cache_out_row)
1736         self.rams(m, r1, early_req_row, cache_out_row, replace_way)
1737         self.dcache_fast_hit(m, req_op, r0_valid, r0, r1,
1738                         req_hit_way, req_index, req_tag, access_ok,
1739                         tlb_hit, tlb_hit_way, tlb_req_index)
1740         self.dcache_slow(m, r1, use_forward1_next, use_forward2_next,
1741                     cache_valids, r0, replace_way,
1742                     req_hit_way, req_same_tag,
1743                          r0_valid, req_op, cache_tags, req_go, ra)
1744         #self.dcache_log(m, r1, valid_ra, tlb_hit_way, stall_out)
1745
1746         return m
1747
1748
1749 if __name__ == '__main__':
1750     dut = DCache()
1751     vl = rtlil.convert(dut, ports=[])
1752     with open("test_dcache.il", "w") as f:
1753         f.write(vl)