1 """DCache
2
3 based on Anton Blanchard microwatt dcache.vhdl
4
5 note that the microwatt dcache wishbone interface expects "stall".
6 for simplicity at the moment this is hard-coded to cyc & ~ack.
7 see WB4 spec, p84, section 5.2.1
8
9 IMPORTANT: for store, the data is sampled the cycle AFTER the "valid"
10 is raised (see RegStage0.d_valid and stage_0, below). sigh
11
12 Links:
13
14 * https://libre-soc.org/3d_gpu/architecture/set_associative_cache.jpg
15 * https://bugs.libre-soc.org/show_bug.cgi?id=469
16 * https://libre-soc.org/irclog-microwatt/%23microwatt.2021-12-07.log.html
17 (discussion about brams for ECP5)
18
19 """
20
21 import sys
22
23 from nmutil.gtkw import write_gtkw
24
25 sys.setrecursionlimit(1000000)
26
27 from enum import Enum, unique
28
29 from nmigen import (Module, Signal, Elaboratable, Cat, Repl, Array, Const,
30 Record, Memory)
31 from nmutil.util import Display
32 from nmigen.lib.coding import Decoder
33
34 from copy import deepcopy
35 from random import randint, seed
36
37 from nmigen_soc.wishbone.bus import Interface
38
39 from nmigen.cli import main
40 from nmutil.iocontrol import RecordObject
41 from nmigen.utils import log2_int
42 from soc.experiment.mem_types import (LoadStore1ToDCacheType,
43 DCacheToLoadStore1Type,
44 MMUToDCacheType,
45 DCacheToMMUType)
46
47 from soc.experiment.wb_types import (WB_ADDR_BITS, WB_DATA_BITS, WB_SEL_BITS,
48 WBAddrType, WBDataType, WBSelType,
49 WBMasterOut, WBSlaveOut,
50 WBMasterOutVector, WBSlaveOutVector,
51 WBIOMasterOut, WBIOSlaveOut)
52
53 from soc.experiment.cache_ram import CacheRam
54 from soc.experiment.plru import PLRU, PLRUs
55 #from nmutil.plru import PLRU, PLRUs
56
57 # for test
58 from soc.bus.sram import SRAM
59 from nmigen import Memory
60 from nmigen.cli import rtlil
61
62 # NOTE: to use cxxsim, export NMIGEN_SIM_MODE=cxxsim from the shell
63 # Also, check out the cxxsim nmigen branch, and latest yosys from git
64 from nmutil.sim_tmp_alternative import Simulator
65
66 from nmutil.util import wrap
67
68
69 # TODO: make these parameters of DCache at some point
70 LINE_SIZE = 64 # Line size in bytes
71 NUM_LINES = 32 # Number of lines in a set
72 NUM_WAYS = 4 # Number of ways
73 TLB_SET_SIZE = 64    # L1 DTLB number of sets
74 TLB_NUM_WAYS = 2     # L1 DTLB number of ways (associativity)
75 TLB_LG_PGSZ = 12 # L1 DTLB log_2(page_size)
76 LOG_LENGTH = 0 # Non-zero to enable log data collection
77
78 # BRAM organisation: We never access more than
79 # WB_DATA_BITS at a time so to save
80 # resources we make the array only that wide, and
81 # use consecutive indices to make a cache "line"
82 #
83 # ROW_SIZE is the width in bytes of the BRAM
84 # (based on WB, so 64-bits)
85 ROW_SIZE = WB_DATA_BITS // 8
86
87 # ROW_PER_LINE is the number of rows (wishbone
88 # transactions) in a line
89 ROW_PER_LINE = LINE_SIZE // ROW_SIZE
90
91 # BRAM_ROWS is the number of rows in BRAM needed
92 # to represent the full dcache
93 BRAM_ROWS = NUM_LINES * ROW_PER_LINE
94
95 print ("ROW_SIZE", ROW_SIZE)
96 print ("ROW_PER_LINE", ROW_PER_LINE)
97 print ("BRAM_ROWS", BRAM_ROWS)
98 print ("NUM_WAYS", NUM_WAYS)
99
100 # Bit fields counts in the address
101
102 # REAL_ADDR_BITS is the number of real address
103 # bits that we store
104 REAL_ADDR_BITS = 56
105
106 # ROW_BITS is the number of bits to select a row
107 ROW_BITS = log2_int(BRAM_ROWS)
108
109 # ROW_LINE_BITS is the number of bits to select
110 # a row within a line
111 ROW_LINE_BITS = log2_int(ROW_PER_LINE)
112
113 # LINE_OFF_BITS is the number of bits for
114 # the offset in a cache line
115 LINE_OFF_BITS = log2_int(LINE_SIZE)
116
117 # ROW_OFF_BITS is the number of bits for
118 # the offset in a row
119 ROW_OFF_BITS = log2_int(ROW_SIZE)
120
121 # INDEX_BITS is the number of bits to
122 # select a cache line
123 INDEX_BITS = log2_int(NUM_LINES)
124
125 # SET_SIZE_BITS is the log base 2 of the set size
126 SET_SIZE_BITS = LINE_OFF_BITS + INDEX_BITS
127
128 # TAG_BITS is the number of bits of
129 # the tag part of the address
130 TAG_BITS = REAL_ADDR_BITS - SET_SIZE_BITS
131
132 # TAG_WIDTH is the width in bits of each way of the tag RAM
133 TAG_WIDTH = TAG_BITS + 7 - ((TAG_BITS + 7) % 8)
134
135 # WAY_BITS is the number of bits to select a way
136 WAY_BITS = log2_int(NUM_WAYS)
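
# With the default geometry above (64-byte lines, 32 lines, 4 ways,
# 64-bit wishbone) the derived values work out to: ROW_SIZE=8,
# ROW_PER_LINE=8, BRAM_ROWS=256, ROW_BITS=8, ROW_LINE_BITS=3,
# LINE_OFF_BITS=6, ROW_OFF_BITS=3, INDEX_BITS=5, SET_SIZE_BITS=11,
# TAG_BITS=45, WAY_BITS=2 (also confirmed by the prints above/below).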
137
138 # Example of layout for 32 lines of 64 bytes:
139 layout = f"""\
140 DCache Layout:
141 |.. -----------------------| REAL_ADDR_BITS ({REAL_ADDR_BITS})
142 .. |--------------| SET_SIZE_BITS ({SET_SIZE_BITS})
143 .. tag |index| line |
144 .. | row | |
145 .. | |---| | ROW_LINE_BITS ({ROW_LINE_BITS})
146 .. | |--- - --| LINE_OFF_BITS ({LINE_OFF_BITS})
147 .. | |- --| ROW_OFF_BITS ({ROW_OFF_BITS})
148 .. |----- ---| | ROW_BITS ({ROW_BITS})
149 .. |-----| | INDEX_BITS ({INDEX_BITS})
150 .. --------| | TAG_BITS ({TAG_BITS})
151 """
152 print (layout)
153 print ("Dcache TAG %d IDX %d ROW_BITS %d ROFF %d LOFF %d RLB %d" % \
154 (TAG_BITS, INDEX_BITS, ROW_BITS,
155 ROW_OFF_BITS, LINE_OFF_BITS, ROW_LINE_BITS))
156 print ("index @: %d-%d" % (LINE_OFF_BITS, SET_SIZE_BITS))
157 print ("row @: %d-%d" % (LINE_OFF_BITS, ROW_OFF_BITS))
158 print ("tag @: %d-%d width %d" % (SET_SIZE_BITS, REAL_ADDR_BITS, TAG_WIDTH))
159
160 TAG_RAM_WIDTH = TAG_WIDTH * NUM_WAYS
161
162 print ("TAG_RAM_WIDTH", TAG_RAM_WIDTH)
163 print (" TAG_WIDTH", TAG_WIDTH)
164 print (" NUM_WAYS", NUM_WAYS)
165 print (" NUM_LINES", NUM_LINES)
166
167
168 def CacheTag(name=None):
169 tag_layout = [('valid', NUM_WAYS),
170 ('tag', TAG_RAM_WIDTH),
171 ]
172 return Record(tag_layout, name=name)
173
174
175 def CacheTagArray():
176 return Array(CacheTag(name="tag%d" % x) for x in range(NUM_LINES))
177
178
179 def RowPerLineValidArray():
180 return Array(Signal(name="rows_valid%d" % x) \
181 for x in range(ROW_PER_LINE))
182
183
184 # L1 TLB
185 TLB_SET_BITS = log2_int(TLB_SET_SIZE)
186 TLB_WAY_BITS = log2_int(TLB_NUM_WAYS)
187 TLB_EA_TAG_BITS = 64 - (TLB_LG_PGSZ + TLB_SET_BITS)
188 TLB_TAG_WAY_BITS = TLB_NUM_WAYS * TLB_EA_TAG_BITS
189 TLB_PTE_BITS = 64
190 TLB_PTE_WAY_BITS = TLB_NUM_WAYS * TLB_PTE_BITS
191
192 def ispow2(x):
193 return (1<<log2_int(x, False)) == x
194
195 assert (LINE_SIZE % ROW_SIZE) == 0, "LINE_SIZE not multiple of ROW_SIZE"
196 assert ispow2(LINE_SIZE), "LINE_SIZE not power of 2"
197 assert ispow2(NUM_LINES), "NUM_LINES not power of 2"
198 assert ispow2(ROW_PER_LINE), "ROW_PER_LINE not power of 2"
199 assert ROW_BITS == (INDEX_BITS + ROW_LINE_BITS), "geometry bits don't add up"
200 assert (LINE_OFF_BITS == ROW_OFF_BITS + ROW_LINE_BITS), \
201 "geometry bits don't add up"
202 assert REAL_ADDR_BITS == (TAG_BITS + INDEX_BITS + LINE_OFF_BITS), \
203 "geometry bits don't add up"
204 assert REAL_ADDR_BITS == (TAG_BITS + ROW_BITS + ROW_OFF_BITS), \
205 "geometry bits don't add up"
206 assert 64 == WB_DATA_BITS, "Can't yet handle wb width that isn't 64-bits"
207 assert SET_SIZE_BITS <= TLB_LG_PGSZ, "Set indexed by virtual address"
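
# sanity-check the L1 DTLB geometry in the same spirit: these asserts
# just restate the definitions above for clarity, they are not new
# constraints
assert ispow2(TLB_SET_SIZE), "TLB_SET_SIZE not power of 2"
assert ispow2(TLB_NUM_WAYS), "TLB_NUM_WAYS not power of 2"
assert TLB_EA_TAG_BITS + TLB_SET_BITS + TLB_LG_PGSZ == 64, \
        "TLB tag/set/page-offset bits must cover the 64-bit EA"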
208
209
210 def TLBHit(name):
211 return Record([('valid', 1),
212 ('way', TLB_WAY_BITS)], name=name)
213
214 def TLBTagEAArray():
215 return Array(Signal(TLB_EA_TAG_BITS, name="tlbtagea%d" % x) \
216 for x in range (TLB_NUM_WAYS))
217
218 def TLBRecord(name):
219 tlb_layout = [('valid', TLB_NUM_WAYS),
220 ('tag', TLB_TAG_WAY_BITS),
221 ('pte', TLB_PTE_WAY_BITS)
222 ]
223 return Record(tlb_layout, name=name)
224
225 def TLBValidArray():
226 return Array(Signal(TLB_NUM_WAYS, name="tlb_valid%d" % x)
227 for x in range(TLB_SET_SIZE))
228
229 def HitWaySet():
230 return Array(Signal(WAY_BITS, name="hitway_%d" % x) \
231 for x in range(TLB_NUM_WAYS))
232
233 # Cache RAM interface
234 def CacheRamOut():
235 return Array(Signal(WB_DATA_BITS, name="cache_out%d" % x) \
236 for x in range(NUM_WAYS))
237
238 # PLRU output interface
239 def PLRUOut():
240 return Array(Signal(WAY_BITS, name="plru_out%d" % x) \
241 for x in range(NUM_LINES))
242
243 # TLB PLRU output interface
244 def TLBPLRUOut():
245 return Array(Signal(TLB_WAY_BITS, name="tlbplru_out%d" % x) \
246 for x in range(TLB_SET_SIZE))
247
248 # Helper functions to decode incoming requests
249 #
250 # Return the cache line index (tag index) for an address
251 def get_index(addr):
252 return addr[LINE_OFF_BITS:SET_SIZE_BITS]
253
254 # Return the cache row index (data memory) for an address
255 def get_row(addr):
256 return addr[ROW_OFF_BITS:SET_SIZE_BITS]
257
258 # Return the index of a row within a line
259 def get_row_of_line(row):
260 return row[:ROW_BITS][:ROW_LINE_BITS]
261
262 # Returns whether this is the last row of a line
263 def is_last_row_addr(addr, last):
264 return addr[ROW_OFF_BITS:LINE_OFF_BITS] == last
265
266 # Returns whether this is the last row of a line
267 def is_last_row(row, last):
268 return get_row_of_line(row) == last
269
270 # Return the next row in the current cache line. We use a
271 # dedicated function in order to limit the size of the
272 # generated adder to be only the bits within a cache line
273 # (3 bits with default settings)
274 def next_row(row):
275 row_v = row[0:ROW_LINE_BITS] + 1
276 return Cat(row_v[:ROW_LINE_BITS], row[ROW_LINE_BITS:])
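
# quick plain-integer mirror of the wrap-around above (illustrative
# only, not used by the hardware): incrementing the last row of a line
# wraps back to row 0 of the *same* line
_rl_mask = (1 << ROW_LINE_BITS) - 1
_last_row = (5 << ROW_LINE_BITS) | _rl_mask  # last row of (arbitrary) line 5
assert ((_last_row & ~_rl_mask) | ((_last_row + 1) & _rl_mask)) == \
        (5 << ROW_LINE_BITS), "next_row must wrap within the line"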
277
278 # Get the tag value from the address
279 def get_tag(addr):
280 return addr[SET_SIZE_BITS:REAL_ADDR_BITS]
281
282 # Read a tag from a tag memory row
283 def read_tag(way, tagset):
284 return tagset.word_select(way, TAG_WIDTH)[:TAG_BITS]
285
286 # Read a TLB tag from a TLB tag memory row
287 def read_tlb_tag(way, tags):
288 return tags.word_select(way, TLB_EA_TAG_BITS)
289
290 # Write a TLB tag to a TLB tag memory row
291 def write_tlb_tag(way, tags, tag):
292 return read_tlb_tag(way, tags).eq(tag)
293
294 # Read a PTE from a TLB PTE memory row
295 def read_tlb_pte(way, ptes):
296 return ptes.word_select(way, TLB_PTE_BITS)
297
298 def write_tlb_pte(way, ptes, newpte):
299 return read_tlb_pte(way, ptes).eq(newpte)
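
# plain-integer sanity check of the address-field helpers above (a
# sketch on an arbitrary example address, mirroring the Signal slicing
# with shifts and masks): tag, index and line-offset must round-trip
# back to the original real address
def _bits(v, lo, hi):
    return (v >> lo) & ((1 << (hi - lo)) - 1)

_ra = 0x123456789ab40 & ((1 << REAL_ADDR_BITS) - 1)  # arbitrary example
_tag = _bits(_ra, SET_SIZE_BITS, REAL_ADDR_BITS)
_idx = _bits(_ra, LINE_OFF_BITS, SET_SIZE_BITS)
_off = _bits(_ra, 0, LINE_OFF_BITS)
assert (_tag << SET_SIZE_BITS) | (_idx << LINE_OFF_BITS) | _off == _ra, \
        "tag/index/offset must reassemble the real address"
# the row index is just the line index with ROW_LINE_BITS more LSBs
assert _bits(_ra, ROW_OFF_BITS, SET_SIZE_BITS) >> ROW_LINE_BITS == _idx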
300
301
302 # Record for storing permission, attribute, etc. bits from a PTE
303 class PermAttr(RecordObject):
304 def __init__(self, name=None):
305 super().__init__(name=name)
306 self.reference = Signal()
307 self.changed = Signal()
308 self.nocache = Signal()
309 self.priv = Signal()
310 self.rd_perm = Signal()
311 self.wr_perm = Signal()
312
313
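# NOTE: currently a stub: the permission/attribute fields are instead
# extracted inline in DCache.tlb_search (see perm_attr assignments there)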
314 def extract_perm_attr(pte):
315 pa = PermAttr()
316     return pa
317
318
319 # Type of operation on a "valid" input
320 @unique
321 class Op(Enum):
322 OP_NONE = 0
323 OP_BAD = 1 # NC cache hit, TLB miss, prot/RC failure
324 OP_STCX_FAIL = 2 # conditional store w/o reservation
325 OP_LOAD_HIT = 3 # Cache hit on load
326 OP_LOAD_MISS = 4 # Load missing cache
327 OP_LOAD_NC = 5 # Non-cachable load
328 OP_STORE_HIT = 6 # Store hitting cache
329 OP_STORE_MISS = 7 # Store missing cache
330
331
332 # Cache state machine
333 @unique
334 class State(Enum):
335 IDLE = 0 # Normal load hit processing
336 RELOAD_WAIT_ACK = 1 # Cache reload wait ack
337 STORE_WAIT_ACK = 2 # Store wait ack
338 NC_LOAD_WAIT_ACK = 3 # Non-cachable load wait ack
339
340
341 # Dcache operations:
342 #
343 # In order to make timing, we use the BRAMs with
344 # an output buffer, which means that the BRAM
345 # output is delayed by an extra cycle.
346 #
347 # Thus, the dcache has a 2-stage internal pipeline
348 # for cache hits with no stalls.
349 #
350 # All other operations are handled via stalling
351 # in the first stage.
352 #
353 # The second stage can thus complete a hit at the same
354 # time as the first stage emits a stall for a complex op.
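#
# Roughly, for the hit path (an illustrative sketch, not a
# cycle-accurate spec): cycle 0 presents the request and the BRAM read
# address (early_req_row); cycle 1 has the request latched in r0 and
# does the TLB/tag compare (dcache_fast_hit); cycle 2 returns the
# extra-buffered BRAM output via writeback_control.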
355 #
356 # Stage 0 register, basically contains just the latched request
357
358 class RegStage0(RecordObject):
359 def __init__(self, name=None):
360 super().__init__(name=name)
361 self.req = LoadStore1ToDCacheType(name="lsmem")
362 self.tlbie = Signal() # indicates a tlbie request (from MMU)
363 self.doall = Signal() # with tlbie, indicates flush whole TLB
364 self.tlbld = Signal() # indicates a TLB load request (from MMU)
365 self.mmu_req = Signal() # indicates source of request
366 self.d_valid = Signal() # indicates req.data is valid now
367
368
369 class MemAccessRequest(RecordObject):
370 def __init__(self, name=None):
371 super().__init__(name=name)
372 self.op = Signal(Op)
373 self.valid = Signal()
374 self.dcbz = Signal()
375 self.real_addr = Signal(REAL_ADDR_BITS)
376 self.data = Signal(64)
377 self.byte_sel = Signal(8)
378 self.hit_way = Signal(WAY_BITS)
379 self.same_tag = Signal()
380 self.mmu_req = Signal()
381
382
383 # First stage register, contains state for stage 1 of load hits
384 # and for the state machine used by all other operations
385 class RegStage1(RecordObject):
386 def __init__(self, name=None):
387 super().__init__(name=name)
388 # Info about the request
389 self.full = Signal() # have uncompleted request
390 self.mmu_req = Signal() # request is from MMU
391 self.req = MemAccessRequest(name="reqmem")
392
393 # Cache hit state
394 self.hit_way = Signal(WAY_BITS)
395 self.hit_load_valid = Signal()
396 self.hit_index = Signal(INDEX_BITS)
397 self.cache_hit = Signal()
398
399 # TLB hit state
400 self.tlb_hit = TLBHit("tlb_hit")
401 self.tlb_hit_index = Signal(TLB_SET_BITS)
402
403 # 2-stage data buffer for data forwarded from writes to reads
404 self.forward_data1 = Signal(64)
405 self.forward_data2 = Signal(64)
406 self.forward_sel1 = Signal(8)
407 self.forward_valid1 = Signal()
408 self.forward_way1 = Signal(WAY_BITS)
409 self.forward_row1 = Signal(ROW_BITS)
410 self.use_forward1 = Signal()
411 self.forward_sel = Signal(8)
412
413 # Cache miss state (reload state machine)
414 self.state = Signal(State)
415 self.dcbz = Signal()
416 self.write_bram = Signal()
417 self.write_tag = Signal()
418 self.slow_valid = Signal()
419 self.wb = WBMasterOut("wb")
420 self.reload_tag = Signal(TAG_BITS)
421 self.store_way = Signal(WAY_BITS)
422 self.store_row = Signal(ROW_BITS)
423 self.store_index = Signal(INDEX_BITS)
424 self.end_row_ix = Signal(ROW_LINE_BITS)
425 self.rows_valid = RowPerLineValidArray()
426 self.acks_pending = Signal(3)
427 self.inc_acks = Signal()
428 self.dec_acks = Signal()
429
430 # Signals to complete (possibly with error)
431 self.ls_valid = Signal()
432 self.ls_error = Signal()
433 self.mmu_done = Signal()
434 self.mmu_error = Signal()
435 self.cache_paradox = Signal()
436
437 # Signal to complete a failed stcx.
438 self.stcx_fail = Signal()
439
440
441 # Reservation information
442 class Reservation(RecordObject):
443 def __init__(self, name=None):
444 super().__init__(name=name)
445 self.valid = Signal()
446 self.addr = Signal(64-LINE_OFF_BITS)
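        # NB: the low LINE_OFF_BITS of the address are not stored, so
        # the reservation granule is effectively one cache line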
447
448
449 class DTLBUpdate(Elaboratable):
450 def __init__(self):
451 self.tlbie = Signal()
452 self.tlbwe = Signal()
453 self.doall = Signal()
454 self.tlb_hit = TLBHit("tlb_hit")
455 self.tlb_req_index = Signal(TLB_SET_BITS)
456
457 self.repl_way = Signal(TLB_WAY_BITS)
458 self.eatag = Signal(TLB_EA_TAG_BITS)
459 self.pte_data = Signal(TLB_PTE_BITS)
460
461 # read from dtlb array
462 self.tlb_read = Signal()
463 self.tlb_read_index = Signal(TLB_SET_BITS)
464 self.tlb_way = TLBRecord("o_tlb_way")
465
466 def elaborate(self, platform):
467 m = Module()
468 comb = m.d.comb
469 sync = m.d.sync
470
471 # there are 3 parts to this:
472 # QTY TLB_NUM_WAYs TAGs - of width (say) 46 bits of Effective Address
473 # QTY TLB_NUM_WAYs PTEs - of width (say) 64 bits
474 # "Valid" bits, one per "way", of QTY TLB_NUM_WAYs. these cannot
475 # be a Memory because they can all be cleared (tlbie, doall), i mean,
476 # we _could_, in theory, by overriding the Reset Signal of the Memory,
477 # hmmm....
478
479 dtlb_valid = TLBValidArray()
480 tlb_req_index = self.tlb_req_index
481
482 print ("TLB_TAG_WAY_BITS", TLB_TAG_WAY_BITS)
483 print (" TLB_EA_TAG_BITS", TLB_EA_TAG_BITS)
484 print (" TLB_NUM_WAYS", TLB_NUM_WAYS)
485 print ("TLB_PTE_WAY_BITS", TLB_PTE_WAY_BITS)
486 print (" TLB_PTE_BITS", TLB_PTE_BITS)
487 print (" TLB_NUM_WAYS", TLB_NUM_WAYS)
488
489 # TAG and PTE Memory SRAMs. transparent, write-enables are TLB_NUM_WAYS
490 tagway = Memory(depth=TLB_SET_SIZE, width=TLB_TAG_WAY_BITS)
491 m.submodules.rd_tagway = rd_tagway = tagway.read_port()
492 m.submodules.wr_tagway = wr_tagway = tagway.write_port(
493 granularity=TLB_EA_TAG_BITS)
494
495 pteway = Memory(depth=TLB_SET_SIZE, width=TLB_PTE_WAY_BITS)
496 m.submodules.rd_pteway = rd_pteway = pteway.read_port()
497 m.submodules.wr_pteway = wr_pteway = pteway.write_port(
498 granularity=TLB_PTE_BITS)
499
500 # commented out for now, can be put in if Memory.reset can be
501 # used for tlbie&doall to reset the entire Memory to zero in 1 cycle
502 #validm = Memory(depth=TLB_SET_SIZE, width=TLB_NUM_WAYS)
503 #m.submodules.rd_valid = rd_valid = validm.read_port()
504 #m.submodules.wr_valid = wr_valid = validm.write_port(
505 #granularity=1)
506
507 # connect up read and write addresses to Valid/PTE/TAG SRAMs
508 m.d.comb += rd_pteway.addr.eq(self.tlb_read_index)
509 m.d.comb += rd_tagway.addr.eq(self.tlb_read_index)
510 #m.d.comb += rd_valid.addr.eq(self.tlb_read_index)
511 m.d.comb += wr_tagway.addr.eq(tlb_req_index)
512 m.d.comb += wr_pteway.addr.eq(tlb_req_index)
513 #m.d.comb += wr_valid.addr.eq(tlb_req_index)
514
515 updated = Signal()
516 v_updated = Signal()
517 tb_out = Signal(TLB_TAG_WAY_BITS) # tlb_way_tags_t
518 db_out = Signal(TLB_NUM_WAYS) # tlb_way_valids_t
519 pb_out = Signal(TLB_PTE_WAY_BITS) # tlb_way_ptes_t
520 dv = Signal(TLB_NUM_WAYS) # tlb_way_valids_t
521
522 comb += dv.eq(dtlb_valid[tlb_req_index])
523 comb += db_out.eq(dv)
524
525 with m.If(self.tlbie & self.doall):
526 # clear all valid bits at once
527 # XXX hmmm, validm _could_ use Memory reset here...
528 for i in range(TLB_SET_SIZE):
529 sync += dtlb_valid[i].eq(0)
530 with m.Elif(self.tlbie):
531 # invalidate just the hit_way
532 with m.If(self.tlb_hit.valid):
533 comb += db_out.bit_select(self.tlb_hit.way, 1).eq(0)
534 comb += v_updated.eq(1)
535 with m.Elif(self.tlbwe):
536 # write to the requested tag and PTE
537 comb += write_tlb_tag(self.repl_way, tb_out, self.eatag)
538 comb += write_tlb_pte(self.repl_way, pb_out, self.pte_data)
539 # set valid bit
540 comb += db_out.bit_select(self.repl_way, 1).eq(1)
541
542 comb += updated.eq(1)
543 comb += v_updated.eq(1)
544
545 # above, sometimes valid is requested to be updated but data not
546 # therefore split them out, here. note the granularity thing matches
547 # with the shift-up of the eatag/pte_data into the correct TLB way.
548         # thus it is not necessary to write the entire lot, just the portion
549 # being altered: hence writing the *old* copy of the row is not needed
550 with m.If(updated): # PTE and TAG to be written
551 comb += wr_pteway.data.eq(pb_out)
552 comb += wr_pteway.en.eq(1<<self.repl_way)
553 comb += wr_tagway.data.eq(tb_out)
554 comb += wr_tagway.en.eq(1<<self.repl_way)
555 with m.If(v_updated): # Valid to be written
556 sync += dtlb_valid[tlb_req_index].eq(db_out)
557 #comb += wr_valid.data.eq(db_out)
558 #comb += wr_valid.en.eq(1<<self.repl_way)
559
560 # select one TLB way, use a register here
561 r_tlb_way = TLBRecord("r_tlb_way")
562 r_delay = Signal()
563 sync += r_delay.eq(self.tlb_read)
564 with m.If(self.tlb_read):
565 sync += self.tlb_way.valid.eq(dtlb_valid[self.tlb_read_index])
566 with m.If(r_delay):
567 # on one clock delay, output the contents of the read port(s)
568 # comb += self.tlb_way.valid.eq(rd_valid.data)
569 comb += self.tlb_way.tag.eq(rd_tagway.data)
570 comb += self.tlb_way.pte.eq(rd_pteway.data)
571 # and also capture the (delayed) output...
572 #sync += r_tlb_way.valid.eq(rd_valid.data)
573 sync += r_tlb_way.tag.eq(rd_tagway.data)
574 sync += r_tlb_way.pte.eq(rd_pteway.data)
575 with m.Else():
576 # ... so that the register can output it when no read is requested
577 # it's rather overkill but better to be safe than sorry
578 comb += self.tlb_way.tag.eq(r_tlb_way.tag)
579 comb += self.tlb_way.pte.eq(r_tlb_way.pte)
580 #comb += self.tlb_way.eq(r_tlb_way)
581
582 return m
583
584
585 class DCachePendingHit(Elaboratable):
586
587 def __init__(self, tlb_way,
588 cache_i_validdx, cache_tag_set,
589 req_addr):
590
591 self.go = Signal()
592 self.virt_mode = Signal()
593 self.is_hit = Signal()
594 self.tlb_hit = TLBHit("tlb_hit")
595 self.hit_way = Signal(WAY_BITS)
596 self.rel_match = Signal()
597 self.req_index = Signal(INDEX_BITS)
598 self.reload_tag = Signal(TAG_BITS)
599
600 self.tlb_way = tlb_way
601 self.cache_i_validdx = cache_i_validdx
602 self.cache_tag_set = cache_tag_set
603 self.req_addr = req_addr
604
605 def elaborate(self, platform):
606 m = Module()
607 comb = m.d.comb
608 sync = m.d.sync
609
610 go = self.go
611 virt_mode = self.virt_mode
612 is_hit = self.is_hit
613 tlb_way = self.tlb_way
614 cache_i_validdx = self.cache_i_validdx
615 cache_tag_set = self.cache_tag_set
616 req_addr = self.req_addr
617 tlb_hit = self.tlb_hit
618 hit_way = self.hit_way
619 rel_match = self.rel_match
620 req_index = self.req_index
621 reload_tag = self.reload_tag
622
623 hit_set = Array(Signal(name="hit_set_%d" % i) \
624 for i in range(TLB_NUM_WAYS))
625 rel_matches = Array(Signal(name="rel_matches_%d" % i) \
626 for i in range(TLB_NUM_WAYS))
627 hit_way_set = HitWaySet()
628
629 # Test if pending request is a hit on any way
630 # In order to make timing in virtual mode,
631 # when we are using the TLB, we compare each
632 # way with each of the real addresses from each way of
633 # the TLB, and then decide later which match to use.
634
635 with m.If(virt_mode):
636 for j in range(TLB_NUM_WAYS): # tlb_num_way_t
637 s_tag = Signal(TAG_BITS, name="s_tag%d" % j)
638 s_hit = Signal(name="s_hit%d" % j)
639 s_pte = Signal(TLB_PTE_BITS, name="s_pte%d" % j)
640 s_ra = Signal(REAL_ADDR_BITS, name="s_ra%d" % j)
641                 # read the PTE, calc the Real Address, get the tag
642 comb += s_pte.eq(read_tlb_pte(j, tlb_way.pte))
643 comb += s_ra.eq(Cat(req_addr[0:TLB_LG_PGSZ],
644 s_pte[TLB_LG_PGSZ:REAL_ADDR_BITS]))
645 comb += s_tag.eq(get_tag(s_ra))
646                 # for each way check the tag against the cache tag set
647 for i in range(NUM_WAYS): # way_t
648 is_tag_hit = Signal(name="is_tag_hit_%d_%d" % (j, i))
649 comb += is_tag_hit.eq(go & cache_i_validdx[i] &
650 (read_tag(i, cache_tag_set) == s_tag)
651 & (tlb_way.valid[j]))
652 with m.If(is_tag_hit):
653 comb += hit_way_set[j].eq(i)
654 comb += s_hit.eq(1)
655 comb += hit_set[j].eq(s_hit)
656 comb += rel_matches[j].eq(s_tag == reload_tag)
657 with m.If(tlb_hit.valid):
658 comb += is_hit.eq(hit_set[tlb_hit.way])
659 comb += hit_way.eq(hit_way_set[tlb_hit.way])
660 comb += rel_match.eq(rel_matches[tlb_hit.way])
661 with m.Else():
662 s_tag = Signal(TAG_BITS)
663 comb += s_tag.eq(get_tag(req_addr))
664 for i in range(NUM_WAYS): # way_t
665 is_tag_hit = Signal(name="is_tag_hit_%d" % i)
666 comb += is_tag_hit.eq(go & cache_i_validdx[i] &
667 (read_tag(i, cache_tag_set) == s_tag))
668 with m.If(is_tag_hit):
669 comb += hit_way.eq(i)
670 comb += is_hit.eq(1)
671 with m.If(s_tag == reload_tag):
672 comb += rel_match.eq(1)
673
674 return m
675
676
677 class DCache(Elaboratable):
678 """Set associative dcache write-through
679
680 TODO (in no specific order):
681 * See list in icache.vhdl
682 * Complete load misses on the cycle when WB data comes instead of
683 at the end of line (this requires dealing with requests coming in
684 while not idle...)
685 """
686 def __init__(self, pspec=None):
687 self.d_in = LoadStore1ToDCacheType("d_in")
688 self.d_out = DCacheToLoadStore1Type("d_out")
689
690 self.m_in = MMUToDCacheType("m_in")
691 self.m_out = DCacheToMMUType("m_out")
692
693 self.stall_out = Signal()
694 self.any_stall_out = Signal()
695 self.dreq_when_stall = Signal()
696 self.mreq_when_stall = Signal()
697
698 # standard naming (wired to non-standard for compatibility)
699 self.bus = Interface(addr_width=32,
700 data_width=64,
701 granularity=8,
702 features={'stall'},
703 alignment=0,
704 name="dcache")
705
706 self.log_out = Signal(20)
707
708 # test if microwatt compatibility is to be enabled
709 self.microwatt_compat = (hasattr(pspec, "microwatt_compat") and
710 (pspec.microwatt_compat == True))
711
712 def stage_0(self, m, r0, r1, r0_full):
713 """Latch the request in r0.req as long as we're not stalling
714 """
715 comb = m.d.comb
716 sync = m.d.sync
717 d_in, d_out, m_in = self.d_in, self.d_out, self.m_in
718
719 r = RegStage0("stage0")
720
721 # TODO, this goes in unit tests and formal proofs
722 with m.If(d_in.valid & m_in.valid):
723 sync += Display("request collision loadstore vs MMU")
724
725 with m.If(m_in.valid):
726 comb += r.req.valid.eq(1)
727 comb += r.req.load.eq(~(m_in.tlbie | m_in.tlbld))# no invalidate
728 comb += r.req.dcbz.eq(0)
729 comb += r.req.nc.eq(0)
730 comb += r.req.reserve.eq(0)
731 comb += r.req.virt_mode.eq(0)
732 comb += r.req.priv_mode.eq(1)
733 comb += r.req.addr.eq(m_in.addr)
734 comb += r.req.data.eq(m_in.pte)
735 comb += r.req.byte_sel.eq(~0) # Const -1 sets all to 0b111....
736 comb += r.tlbie.eq(m_in.tlbie)
737 comb += r.doall.eq(m_in.doall)
738 comb += r.tlbld.eq(m_in.tlbld)
739 comb += r.mmu_req.eq(1)
740 comb += r.d_valid.eq(1)
741 m.d.sync += Display(" DCACHE req mmu addr %x pte %x ld %d",
742 m_in.addr, m_in.pte, r.req.load)
743
744 with m.Else():
745 comb += r.req.eq(d_in)
746 comb += r.req.data.eq(0)
747 comb += r.tlbie.eq(0)
748 comb += r.doall.eq(0)
749 comb += r.tlbld.eq(0)
750 comb += r.mmu_req.eq(0)
751 comb += r.d_valid.eq(0)
752
753 sync += r0_full.eq(0)
754 with m.If((~r1.full & ~d_in.hold) | ~r0_full):
755 sync += r0.eq(r)
756 sync += r0_full.eq(r.req.valid)
757 with m.Elif(~r0.d_valid):
758 # Sample data the cycle after a request comes in from loadstore1.
759 # If another request has come in already then the data will get
760 # put directly into req.data below.
761 sync += r0.req.data.eq(d_in.data)
762 sync += r0.d_valid.eq(1)
763 with m.If(d_in.valid):
764 m.d.sync += Display(" DCACHE req cache "
765 "virt %d addr %x data %x ld %d",
766 r.req.virt_mode, r.req.addr,
767 r.req.data, r.req.load)
768
769 def tlb_read(self, m, r0_stall, tlb_way):
770 """TLB
771 Operates in the second cycle on the request latched in r0.req.
772 TLB updates write the entry at the end of the second cycle.
773 """
774 comb = m.d.comb
775 sync = m.d.sync
776 m_in, d_in = self.m_in, self.d_in
777
778 addrbits = Signal(TLB_SET_BITS)
779
780 amin = TLB_LG_PGSZ
781 amax = TLB_LG_PGSZ + TLB_SET_BITS
782
783 with m.If(m_in.valid):
784 comb += addrbits.eq(m_in.addr[amin : amax])
785 with m.Else():
786 comb += addrbits.eq(d_in.addr[amin : amax])
787
788 # If we have any op and the previous op isn't finished,
789 # then keep the same output for next cycle.
790 d = self.dtlb_update
791 comb += d.tlb_read_index.eq(addrbits)
792 comb += d.tlb_read.eq(~r0_stall)
793 comb += tlb_way.eq(d.tlb_way)
794
795 def maybe_tlb_plrus(self, m, r1, tlb_plru_victim, tlb_req_index):
796 """Generate TLB PLRUs
797 """
798 comb = m.d.comb
799 sync = m.d.sync
800
801 if TLB_NUM_WAYS == 0:
802 return
803
804 # suite of PLRUs with a selection and output mechanism
805 tlb_plrus = PLRUs(TLB_SET_SIZE, TLB_WAY_BITS)
806 m.submodules.tlb_plrus = tlb_plrus
807 comb += tlb_plrus.way.eq(r1.tlb_hit.way)
808 comb += tlb_plrus.valid.eq(r1.tlb_hit.valid)
809 comb += tlb_plrus.index.eq(r1.tlb_hit_index)
810 comb += tlb_plrus.isel.eq(tlb_req_index) # select victim
811 comb += tlb_plru_victim.eq(tlb_plrus.o_index) # selected victim
812
813 def tlb_search(self, m, tlb_req_index, r0, r0_valid,
814 tlb_way,
815 pte, tlb_hit, valid_ra, perm_attr, ra):
816
817 comb = m.d.comb
818
819 hitway = Signal(TLB_WAY_BITS)
820 hit = Signal()
821 eatag = Signal(TLB_EA_TAG_BITS)
822
823 TLB_LG_END = TLB_LG_PGSZ + TLB_SET_BITS
824 comb += tlb_req_index.eq(r0.req.addr[TLB_LG_PGSZ : TLB_LG_END])
825 comb += eatag.eq(r0.req.addr[TLB_LG_END : 64 ])
826
827 for i in range(TLB_NUM_WAYS):
828 is_tag_hit = Signal(name="is_tag_hit%d" % i)
829 tlb_tag = Signal(TLB_EA_TAG_BITS, name="tlb_tag%d" % i)
830 comb += tlb_tag.eq(read_tlb_tag(i, tlb_way.tag))
831 comb += is_tag_hit.eq((tlb_way.valid[i]) & (tlb_tag == eatag))
832 with m.If(is_tag_hit):
833 comb += hitway.eq(i)
834 comb += hit.eq(1)
835
836 comb += tlb_hit.valid.eq(hit & r0_valid)
837 comb += tlb_hit.way.eq(hitway)
838
839 with m.If(tlb_hit.valid):
840 comb += pte.eq(read_tlb_pte(hitway, tlb_way.pte))
841 comb += valid_ra.eq(tlb_hit.valid | ~r0.req.virt_mode)
842
843 with m.If(r0.req.virt_mode):
844 comb += ra.eq(Cat(Const(0, ROW_OFF_BITS),
845 r0.req.addr[ROW_OFF_BITS:TLB_LG_PGSZ],
846 pte[TLB_LG_PGSZ:REAL_ADDR_BITS]))
847 comb += perm_attr.reference.eq(pte[8])
848 comb += perm_attr.changed.eq(pte[7])
849 comb += perm_attr.nocache.eq(pte[5])
850 comb += perm_attr.priv.eq(pte[3])
851 comb += perm_attr.rd_perm.eq(pte[2])
852 comb += perm_attr.wr_perm.eq(pte[1])
853 with m.Else():
854 comb += ra.eq(Cat(Const(0, ROW_OFF_BITS),
855 r0.req.addr[ROW_OFF_BITS:REAL_ADDR_BITS]))
856 comb += perm_attr.reference.eq(1)
857 comb += perm_attr.changed.eq(1)
858 comb += perm_attr.nocache.eq(0)
859 comb += perm_attr.priv.eq(1)
860 comb += perm_attr.rd_perm.eq(1)
861 comb += perm_attr.wr_perm.eq(1)
862
863 with m.If(valid_ra):
864 m.d.sync += Display("DCACHE virt mode %d hit %d ra %x pte %x",
865 r0.req.virt_mode, tlb_hit.valid, ra, pte)
866 m.d.sync += Display(" perm ref=%d", perm_attr.reference)
867 m.d.sync += Display(" perm chg=%d", perm_attr.changed)
868 m.d.sync += Display(" perm noc=%d", perm_attr.nocache)
869 m.d.sync += Display(" perm prv=%d", perm_attr.priv)
870 m.d.sync += Display(" perm rdp=%d", perm_attr.rd_perm)
871 m.d.sync += Display(" perm wrp=%d", perm_attr.wr_perm)
872
873 def tlb_update(self, m, r0_valid, r0, tlb_req_index,
874 tlb_hit, tlb_plru_victim):
875
876 comb = m.d.comb
877 sync = m.d.sync
878
879 tlbie = Signal()
880 tlbwe = Signal()
881
882 comb += tlbie.eq(r0_valid & r0.tlbie)
883 comb += tlbwe.eq(r0_valid & r0.tlbld)
884
885 d = self.dtlb_update
886
887 comb += d.tlbie.eq(tlbie)
888 comb += d.tlbwe.eq(tlbwe)
889 comb += d.doall.eq(r0.doall)
890 comb += d.tlb_hit.eq(tlb_hit)
891 comb += d.tlb_req_index.eq(tlb_req_index)
892
893 with m.If(tlb_hit.valid):
894 comb += d.repl_way.eq(tlb_hit.way)
895 with m.Else():
896 comb += d.repl_way.eq(tlb_plru_victim)
897 comb += d.eatag.eq(r0.req.addr[TLB_LG_PGSZ + TLB_SET_BITS:64])
898 comb += d.pte_data.eq(r0.req.data)
899
900 def maybe_plrus(self, m, r1, plru_victim):
901 """Generate PLRUs
902 """
903 comb = m.d.comb
904 sync = m.d.sync
905
906         if NUM_WAYS == 0:
907 return
908
909 # suite of PLRUs with a selection and output mechanism
910 m.submodules.plrus = plrus = PLRUs(NUM_LINES, WAY_BITS)
911 comb += plrus.way.eq(r1.hit_way)
912 comb += plrus.valid.eq(r1.cache_hit)
913 comb += plrus.index.eq(r1.hit_index)
914 comb += plrus.isel.eq(r1.store_index) # select victim
915 comb += plru_victim.eq(plrus.o_index) # selected victim
916
917 def cache_tag_read(self, m, r0_stall, req_index, cache_tag_set, cache_tags):
918 """Cache tag RAM read port
919 """
920 comb = m.d.comb
921 sync = m.d.sync
922 m_in, d_in = self.m_in, self.d_in
923
924 index = Signal(INDEX_BITS)
925
926 with m.If(r0_stall):
927 comb += index.eq(req_index)
928 with m.Elif(m_in.valid):
929 comb += index.eq(get_index(m_in.addr))
930 with m.Else():
931 comb += index.eq(get_index(d_in.addr))
932 sync += cache_tag_set.eq(cache_tags[index].tag)
933
934 def dcache_request(self, m, r0, ra, req_index, req_row, req_tag,
935 r0_valid, r1, cache_tags, replace_way,
936 use_forward1_next, use_forward2_next,
937 req_hit_way, plru_victim, rc_ok, perm_attr,
938 valid_ra, perm_ok, access_ok, req_op, req_go,
939 tlb_hit, tlb_way, cache_tag_set,
940 cancel_store, req_same_tag, r0_stall, early_req_row):
941 """Cache request parsing and hit detection
942 """
943
944 comb = m.d.comb
945 m_in, d_in = self.m_in, self.d_in
946
947 is_hit = Signal()
948 hit_way = Signal(WAY_BITS)
949 op = Signal(Op)
950 opsel = Signal(3)
951 go = Signal()
952 nc = Signal()
953 cache_i_validdx = Signal(NUM_WAYS)
954
955 # Extract line, row and tag from request
956 comb += req_index.eq(get_index(r0.req.addr))
957 comb += req_row.eq(get_row(r0.req.addr))
958 comb += req_tag.eq(get_tag(ra))
959
960 if False: # display on comb is a bit... busy.
961 comb += Display("dcache_req addr:%x ra: %x idx: %x tag: %x row: %x",
962 r0.req.addr, ra, req_index, req_tag, req_row)
963
964 comb += go.eq(r0_valid & ~(r0.tlbie | r0.tlbld) & ~r1.ls_error)
965 comb += cache_i_validdx.eq(cache_tags[req_index].valid)
966
967 m.submodules.dcache_pend = dc = DCachePendingHit(tlb_way,
968 cache_i_validdx, cache_tag_set,
969 r0.req.addr)
970 comb += dc.tlb_hit.eq(tlb_hit)
971 comb += dc.reload_tag.eq(r1.reload_tag)
972 comb += dc.virt_mode.eq(r0.req.virt_mode)
973 comb += dc.go.eq(go)
974 comb += dc.req_index.eq(req_index)
975
976 comb += is_hit.eq(dc.is_hit)
977 comb += hit_way.eq(dc.hit_way)
978 comb += req_same_tag.eq(dc.rel_match)
979
980 # See if the request matches the line currently being reloaded
981 with m.If((r1.state == State.RELOAD_WAIT_ACK) &
982 (req_index == r1.store_index) & req_same_tag):
983 # For a store, consider this a hit even if the row isn't
984 # valid since it will be by the time we perform the store.
985 # For a load, check the appropriate row valid bit.
986 rrow = Signal(ROW_LINE_BITS)
987 comb += rrow.eq(req_row)
988 valid = r1.rows_valid[rrow]
989 comb += is_hit.eq((~r0.req.load) | valid)
990 comb += hit_way.eq(replace_way)
991
992 # Whether to use forwarded data for a load or not
993 with m.If((get_row(r1.req.real_addr) == req_row) &
994 (r1.req.hit_way == hit_way)):
995 # Only need to consider r1.write_bram here, since if we
996 # are writing refill data here, then we don't have a
997 # cache hit this cycle on the line being refilled.
998 # (There is the possibility that the load following the
999 # load miss that started the refill could be to the old
1000 # contents of the victim line, since it is a couple of
1001 # cycles after the refill starts before we see the updated
1002 # cache tag. In that case we don't use the bypass.)
1003 comb += use_forward1_next.eq(r1.write_bram)
1004 with m.If((r1.forward_row1 == req_row) & (r1.forward_way1 == hit_way)):
1005 comb += use_forward2_next.eq(r1.forward_valid1)
1006
1007 # The way that matched on a hit
1008 comb += req_hit_way.eq(hit_way)
1009
1010 # The way to replace on a miss
1011 with m.If(r1.write_tag):
1012 comb += replace_way.eq(plru_victim)
1013 with m.Else():
1014 comb += replace_way.eq(r1.store_way)
1015
1016 # work out whether we have permission for this access
1017 # NB we don't yet implement AMR, thus no KUAP
1018 comb += rc_ok.eq(perm_attr.reference
1019 & (r0.req.load | perm_attr.changed))
1020 comb += perm_ok.eq((r0.req.priv_mode | (~perm_attr.priv)) &
1021 (perm_attr.wr_perm |
1022 (r0.req.load & perm_attr.rd_perm)))
1023 comb += access_ok.eq(valid_ra & perm_ok & rc_ok)
1024
1025 # Combine the request and cache hit status to decide what
1026 # operation needs to be done
1027 comb += nc.eq(r0.req.nc | perm_attr.nocache)
1028 comb += op.eq(Op.OP_NONE)
1029 with m.If(go):
1030 with m.If(~access_ok):
1031 m.d.sync += Display("DCACHE access fail valid_ra=%d p=%d rc=%d",
1032 valid_ra, perm_ok, rc_ok)
1033 comb += op.eq(Op.OP_BAD)
1034 with m.Elif(cancel_store):
1035 m.d.sync += Display("DCACHE cancel store")
1036 comb += op.eq(Op.OP_STCX_FAIL)
1037 with m.Else():
1038 m.d.sync += Display("DCACHE valid_ra=%d nc=%d ld=%d",
1039 valid_ra, nc, r0.req.load)
1040 comb += opsel.eq(Cat(is_hit, nc, r0.req.load))
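                # opsel is Cat(is_hit, nc, load): bit 0 = cache hit,
                # bit 1 = non-cacheable, bit 2 = load. so 0b101 is a
                # cacheable load hit, 0b100 a load miss, 0b110 an NC
                # load; 0b011/0b111 (NC yet "hit") are cache paradoxes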
1041 with m.Switch(opsel):
1042 with m.Case(0b101): comb += op.eq(Op.OP_LOAD_HIT)
1043 with m.Case(0b100): comb += op.eq(Op.OP_LOAD_MISS)
1044 with m.Case(0b110): comb += op.eq(Op.OP_LOAD_NC)
1045 with m.Case(0b001): comb += op.eq(Op.OP_STORE_HIT)
1046 with m.Case(0b000): comb += op.eq(Op.OP_STORE_MISS)
1047 with m.Case(0b010): comb += op.eq(Op.OP_STORE_MISS)
1048 with m.Case(0b011): comb += op.eq(Op.OP_BAD)
1049 with m.Case(0b111): comb += op.eq(Op.OP_BAD)
1050 comb += req_op.eq(op)
1051 comb += req_go.eq(go)
1052
1053 # Version of the row number that is valid one cycle earlier
1054 # in the cases where we need to read the cache data BRAM.
1055 # If we're stalling then we need to keep reading the last
1056 # row requested.
1057 with m.If(~r0_stall):
1058 with m.If(m_in.valid):
1059 comb += early_req_row.eq(get_row(m_in.addr))
1060 with m.Else():
1061 comb += early_req_row.eq(get_row(d_in.addr))
1062 with m.Else():
1063 comb += early_req_row.eq(req_row)
1064
1065 def reservation_comb(self, m, cancel_store, set_rsrv, clear_rsrv,
1066 r0_valid, r0, reservation):
1067 """Handle load-with-reservation and store-conditional instructions
1068 """
1069 comb = m.d.comb
1070
1071 with m.If(r0_valid & r0.req.reserve):
1072 # XXX generate alignment interrupt if address
1073 # is not aligned XXX or if r0.req.nc = '1'
1074 with m.If(r0.req.load):
1075 comb += set_rsrv.eq(r0.req.atomic_last) # load with reservation
1076 with m.Else():
1077 comb += clear_rsrv.eq(r0.req.atomic_last) # store conditional
1078 with m.If((~reservation.valid) |
1079 (r0.req.addr[LINE_OFF_BITS:64] != reservation.addr)):
1080 comb += cancel_store.eq(1)
1081
1082 def reservation_reg(self, m, r0_valid, access_ok, set_rsrv, clear_rsrv,
1083 reservation, r0):
1084 comb = m.d.comb
1085 sync = m.d.sync
1086
1087 with m.If(r0_valid & access_ok):
1088 with m.If(clear_rsrv):
1089 sync += reservation.valid.eq(0)
1090 with m.Elif(set_rsrv):
1091 sync += reservation.valid.eq(1)
1092 sync += reservation.addr.eq(r0.req.addr[LINE_OFF_BITS:64])
1093
1094 def writeback_control(self, m, r1, cache_out_row):
1095 """Return data for loads & completion control logic
1096 """
1097 comb = m.d.comb
1098 sync = m.d.sync
1099 d_out, m_out = self.d_out, self.m_out
1100
1101 data_out = Signal(64)
1102 data_fwd = Signal(64)
1103
1104         # Use the bypass if we are reading the row that was
1105 # written 1 or 2 cycles ago, including for the
1106 # slow_valid = 1 case (i.e. completing a load
1107 # miss or a non-cacheable load).
1108 with m.If(r1.use_forward1):
1109 comb += data_fwd.eq(r1.forward_data1)
1110 with m.Else():
1111 comb += data_fwd.eq(r1.forward_data2)
1112
1113 comb += data_out.eq(cache_out_row)
1114
1115 for i in range(8):
1116 with m.If(r1.forward_sel[i]):
1117 dsel = data_fwd.word_select(i, 8)
1118 comb += data_out.word_select(i, 8).eq(dsel)
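        # e.g. forward_sel = 0b00001111 merges the low four bytes from
        # the forwarded write data with the high four bytes read from
        # the cache BRAM row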
1119
1120 # DCache output to LoadStore
1121 comb += d_out.valid.eq(r1.ls_valid)
1122 comb += d_out.data.eq(data_out)
1123 comb += d_out.store_done.eq(~r1.stcx_fail)
1124 comb += d_out.error.eq(r1.ls_error)
1125 comb += d_out.cache_paradox.eq(r1.cache_paradox)
1126
1127 # Outputs to MMU
1128 comb += m_out.done.eq(r1.mmu_done)
1129 comb += m_out.err.eq(r1.mmu_error)
1130 comb += m_out.data.eq(data_out)
1131
1132 # We have a valid load or store hit or we just completed
1133 # a slow op such as a load miss, a NC load or a store
1134 #
1135 # Note: the load hit is delayed by one cycle. However it
1136     # still cannot collide with r.slow_valid (well, unless I
1137 # miscalculated) because slow_valid can only be set on a
1138 # subsequent request and not on its first cycle (the state
1139 # machine must have advanced), which makes slow_valid
1140 # at least 2 cycles from the previous hit_load_valid.
1141
1142 # Sanity: Only one of these must be set in any given cycle
1143
1144 if False: # TODO: need Display to get this to work
1145 assert (r1.slow_valid & r1.stcx_fail) != 1, \
1146 "unexpected slow_valid collision with stcx_fail"
1147
1148 assert ((r1.slow_valid | r1.stcx_fail) | r1.hit_load_valid) != 1, \
1149 "unexpected hit_load_delayed collision with slow_valid"
1150
1151 with m.If(~r1.mmu_req):
1152 # Request came from loadstore1...
1153 # Load hit case is the standard path
1154 with m.If(r1.hit_load_valid):
1155 sync += Display("completing load hit data=%x", data_out)
1156
1157 # error cases complete without stalling
1158 with m.If(r1.ls_error):
1159 with m.If(r1.dcbz):
1160 sync += Display("completing dcbz with error")
1161 with m.Else():
1162 sync += Display("completing ld/st with error")
1163
1164 # Slow ops (load miss, NC, stores)
1165 with m.If(r1.slow_valid):
1166 sync += Display("completing store or load miss adr=%x data=%x",
1167 r1.req.real_addr, data_out)
1168
1169 with m.Else():
1170 # Request came from MMU
1171 with m.If(r1.hit_load_valid):
1172 sync += Display("completing load hit to MMU, data=%x",
1173 m_out.data)
1174 # error cases complete without stalling
1175 with m.If(r1.mmu_error):
1176                 sync += Display("completing MMU ld with error")
1177
1178 # Slow ops (i.e. load miss)
1179 with m.If(r1.slow_valid):
1180 sync += Display("completing MMU load miss, adr=%x data=%x",
1181 r1.req.real_addr, m_out.data)
1182
1183 def rams(self, m, r1, early_req_row, cache_out_row, replace_way):
1184 """rams
1185 Generate a cache RAM for each way. This handles the normal
1186 reads, writes from reloads and the special store-hit update
1187 path as well.
1188
1189 Note: the BRAMs have an extra read buffer, meaning the output
1190 is pipelined an extra cycle. This differs from the
1191 icache. The writeback logic needs to take that into
1192 account by using 1-cycle delayed signals for load hits.
1193 """
1194 comb = m.d.comb
1195 bus = self.bus
1196
1197         # Binary-to-Unary one-hot decoders here. the replace-way one-hot is gated
1198 # (enabled) by bus.ack, not-write-bram, and state RELOAD_WAIT_ACK
1199 m.submodules.rams_replace_way_e = rwe = Decoder(NUM_WAYS)
1200 comb += rwe.n.eq(~((r1.state == State.RELOAD_WAIT_ACK) & bus.ack &
1201 ~r1.write_bram))
1202 comb += rwe.i.eq(replace_way)
1203
1204 m.submodules.rams_hit_way_e = hwe = Decoder(NUM_WAYS)
1205 comb += hwe.i.eq(r1.hit_way)
1206
1207 # this one is gated with write_bram, and replace_way_e can never be
1208 # set at the same time. that means that do_write can OR the outputs
1209 m.submodules.rams_hit_req_way_e = hre = Decoder(NUM_WAYS)
1210 comb += hre.n.eq(~r1.write_bram) # Decoder.n is inverted
1211 comb += hre.i.eq(r1.req.hit_way)
1212
1213 # common Signals
1214 do_read = Signal()
1215 wr_addr = Signal(ROW_BITS)
1216 wr_data = Signal(WB_DATA_BITS)
1217 wr_sel = Signal(ROW_SIZE)
1218 rd_addr = Signal(ROW_BITS)
1219
1220 comb += do_read.eq(1) # always enable
1221 comb += rd_addr.eq(early_req_row)
1222
1223 # Write mux:
1224 #
1225 # Defaults to wishbone read responses (cache refill)
1226 #
1227 # For timing, the mux on wr_data/sel/addr is not
1228 # dependent on anything other than the current state.
1229
1230 with m.If(r1.write_bram):
1231 # Write store data to BRAM. This happens one
1232 # cycle after the store is in r0.
1233 comb += wr_data.eq(r1.req.data)
1234 comb += wr_sel.eq(r1.req.byte_sel)
1235 comb += wr_addr.eq(get_row(r1.req.real_addr))
1236
1237 with m.Else():
1238 # Otherwise, we might be doing a reload or a DCBZ
1239 with m.If(r1.dcbz):
1240 comb += wr_data.eq(0)
1241 with m.Else():
1242 comb += wr_data.eq(bus.dat_r)
1243 comb += wr_addr.eq(r1.store_row)
1244 comb += wr_sel.eq(~0) # all 1s
1245
1246 # set up Cache Rams
1247 for i in range(NUM_WAYS):
1248 do_write = Signal(name="do_wr%d" % i)
1249 wr_sel_m = Signal(ROW_SIZE, name="wr_sel_m_%d" % i)
1250 d_out = Signal(WB_DATA_BITS, name="dout_%d" % i) # cache_row_t
1251
1252 way = CacheRam(ROW_BITS, WB_DATA_BITS, ADD_BUF=True, ram_num=i)
1253 m.submodules["cacheram_%d" % i] = way
1254
1255 comb += way.rd_en.eq(do_read)
1256 comb += way.rd_addr.eq(rd_addr)
1257 comb += d_out.eq(way.rd_data_o)
1258 comb += way.wr_sel.eq(wr_sel_m)
1259 comb += way.wr_addr.eq(wr_addr)
1260 comb += way.wr_data.eq(wr_data)
1261
1262 # Cache hit reads
1263 with m.If(hwe.o[i]):
1264 comb += cache_out_row.eq(d_out)
1265
1266 # these are mutually-exclusive via their Decoder-enablers
1267 # (note: Decoder-enable is inverted)
1268 comb += do_write.eq(hre.o[i] | rwe.o[i])
1269
1270 # Mask write selects with do_write since BRAM
1271 # doesn't have a global write-enable
1272 with m.If(do_write):
1273 comb += wr_sel_m.eq(wr_sel)
1274
1275 # Cache hit synchronous machine for the easy case.
1276 # This handles load hits.
1277 # It also handles error cases (TLB miss, cache paradox)
1278 def dcache_fast_hit(self, m, req_op, r0_valid, r0, r1,
1279 req_hit_way, req_index, req_tag, access_ok,
1280 tlb_hit, tlb_req_index):
1281 comb = m.d.comb
1282 sync = m.d.sync
1283
1284 with m.If(req_op != Op.OP_NONE):
1285 sync += Display("op:%d addr:%x nc: %d idx: %x tag: %x way: %x",
1286 req_op, r0.req.addr, r0.req.nc,
1287 req_index, req_tag, req_hit_way)
1288
1289 with m.If(r0_valid):
1290 sync += r1.mmu_req.eq(r0.mmu_req)
1291
1292 # Fast path for load/store hits.
1293 # Set signals for the writeback controls.
1294 sync += r1.hit_way.eq(req_hit_way)
1295 sync += r1.hit_index.eq(req_index)
1296
1297 sync += r1.hit_load_valid.eq(req_op == Op.OP_LOAD_HIT)
1298 sync += r1.cache_hit.eq((req_op == Op.OP_LOAD_HIT) |
1299 (req_op == Op.OP_STORE_HIT))
1300
1301 with m.If(req_op == Op.OP_BAD):
1302 sync += Display("Signalling ld/st error "
1303 "ls_error=%i mmu_error=%i cache_paradox=%i",
1304                                 ~r0.mmu_req, r0.mmu_req, access_ok)
1305 sync += r1.ls_error.eq(~r0.mmu_req)
1306 sync += r1.mmu_error.eq(r0.mmu_req)
1307 sync += r1.cache_paradox.eq(access_ok)
1308 with m.Else():
1309 sync += r1.ls_error.eq(0)
1310 sync += r1.mmu_error.eq(0)
1311 sync += r1.cache_paradox.eq(0)
1312
1313 sync += r1.stcx_fail.eq(req_op == Op.OP_STCX_FAIL)
1314
1315 # Record TLB hit information for updating TLB PLRU
1316 sync += r1.tlb_hit.eq(tlb_hit)
1317 sync += r1.tlb_hit_index.eq(tlb_req_index)
1318
1319 # Memory accesses are handled by this state machine:
1320 #
1321 # * Cache load miss/reload (in conjunction with "rams")
1322 # * Load hits for non-cachable forms
1323 # * Stores (the collision case is handled in "rams")
1324 #
1325 # All wishbone requests generation is done here.
1326 # This machine operates at stage 1.
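    # (State transitions, for reference: IDLE -> RELOAD_WAIT_ACK on a
    # load miss or dcbz, IDLE -> STORE_WAIT_ACK on a plain store,
    # IDLE -> NC_LOAD_WAIT_ACK on a non-cacheable load; each returns
    # to IDLE when the wishbone cycle completes.)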
1327 def dcache_slow(self, m, r1, use_forward1_next, use_forward2_next,
1328 r0, replace_way,
1329 req_hit_way, req_same_tag,
1330 r0_valid, req_op, cache_tags, req_go, ra):
1331
1332 comb = m.d.comb
1333 sync = m.d.sync
1334 bus = self.bus
1335 d_in = self.d_in
1336
1337 req = MemAccessRequest("mreq_ds")
1338
1339 r1_next_cycle = Signal()
1340 req_row = Signal(ROW_BITS)
1341 req_idx = Signal(INDEX_BITS)
1342 req_tag = Signal(TAG_BITS)
1343 comb += req_idx.eq(get_index(req.real_addr))
1344 comb += req_row.eq(get_row(req.real_addr))
1345 comb += req_tag.eq(get_tag(req.real_addr))
1346
1347 sync += r1.use_forward1.eq(use_forward1_next)
1348 sync += r1.forward_sel.eq(0)
1349
1350 with m.If(use_forward1_next):
1351 sync += r1.forward_sel.eq(r1.req.byte_sel)
1352 with m.Elif(use_forward2_next):
1353 sync += r1.forward_sel.eq(r1.forward_sel1)
1354
1355 sync += r1.forward_data2.eq(r1.forward_data1)
1356 with m.If(r1.write_bram):
1357 sync += r1.forward_data1.eq(r1.req.data)
1358 sync += r1.forward_sel1.eq(r1.req.byte_sel)
1359 sync += r1.forward_way1.eq(r1.req.hit_way)
1360 sync += r1.forward_row1.eq(get_row(r1.req.real_addr))
1361 sync += r1.forward_valid1.eq(1)
1362 with m.Else():
1363 with m.If(r1.dcbz):
1364 sync += r1.forward_data1.eq(0)
1365 with m.Else():
1366 sync += r1.forward_data1.eq(bus.dat_r)
1367 sync += r1.forward_sel1.eq(~0) # all 1s
1368 sync += r1.forward_way1.eq(replace_way)
1369 sync += r1.forward_row1.eq(r1.store_row)
1370 sync += r1.forward_valid1.eq(0)
1371
1372 # One cycle pulses reset
1373 sync += r1.slow_valid.eq(0)
1374 sync += r1.write_bram.eq(0)
1375 sync += r1.inc_acks.eq(0)
1376 sync += r1.dec_acks.eq(0)
1377
1378 sync += r1.ls_valid.eq(0)
1379 # complete tlbies and TLB loads in the third cycle
1380 sync += r1.mmu_done.eq(r0_valid & (r0.tlbie | r0.tlbld))
1381
1382 with m.If((req_op == Op.OP_LOAD_HIT) | (req_op == Op.OP_STCX_FAIL)):
1383 with m.If(r0.mmu_req):
1384 sync += r1.mmu_done.eq(1)
1385 with m.Else():
1386 sync += r1.ls_valid.eq(1)
1387
1388 with m.If(r1.write_tag):
1389 # Store new tag in selected way
1390 replace_way_onehot = Signal(NUM_WAYS)
1391 comb += replace_way_onehot.eq(1<<replace_way)
1392 for i in range(NUM_WAYS):
1393 with m.If(replace_way_onehot[i]):
1394 ct = Signal(TAG_RAM_WIDTH)
1395 comb += ct.eq(cache_tags[r1.store_index].tag)
1396 comb += ct.word_select(i, TAG_WIDTH).eq(r1.reload_tag)
1397 sync += cache_tags[r1.store_index].tag.eq(ct)
1398 sync += r1.store_way.eq(replace_way)
1399 sync += r1.write_tag.eq(0)
1400
1401 # Take request from r1.req if there is one there,
1402 # else from req_op, ra, etc.
1403 with m.If(r1.full):
1404 comb += req.eq(r1.req)
1405 with m.Else():
1406 comb += req.op.eq(req_op)
1407 comb += req.valid.eq(req_go)
1408 comb += req.mmu_req.eq(r0.mmu_req)
1409 comb += req.dcbz.eq(r0.req.dcbz)
1410 comb += req.real_addr.eq(ra)
1411
1412 with m.If(r0.req.dcbz):
1413 # force data to 0 for dcbz
1414 comb += req.data.eq(0)
1415 with m.Elif(r0.d_valid):
1416 comb += req.data.eq(r0.req.data)
1417 with m.Else():
1418 comb += req.data.eq(d_in.data)
1419
1420 # Select all bytes for dcbz
1421 # and for cacheable loads
1422 with m.If(r0.req.dcbz | (r0.req.load & ~r0.req.nc)):
1423 comb += req.byte_sel.eq(~0) # all 1s
1424 with m.Else():
1425 comb += req.byte_sel.eq(r0.req.byte_sel)
1426 comb += req.hit_way.eq(req_hit_way)
1427 comb += req.same_tag.eq(req_same_tag)
1428
1429 # Store the incoming request from r0,
1430 # if it is a slow request
1431 # Note that r1.full = 1 implies req_op = OP_NONE
1432 with m.If((req_op == Op.OP_LOAD_MISS)
1433 | (req_op == Op.OP_LOAD_NC)
1434 | (req_op == Op.OP_STORE_MISS)
1435 | (req_op == Op.OP_STORE_HIT)):
1436 sync += r1.req.eq(req)
1437 sync += r1.full.eq(1)
1438 # do not let r1.state RELOAD_WAIT_ACK or STORE_WAIT_ACK
1439 # destroy r1.req by overwriting r1.full back to zero
1440 comb += r1_next_cycle.eq(1)
1441
1442 # Main state machine
1443 with m.Switch(r1.state):
1444
1445 with m.Case(State.IDLE):
1446 sync += r1.wb.adr.eq(req.real_addr[ROW_LINE_BITS:])
1447 sync += r1.wb.sel.eq(req.byte_sel)
1448 sync += r1.wb.dat.eq(req.data)
1449 sync += r1.dcbz.eq(req.dcbz)
1450
1451 # Keep track of our index and way
1452 # for subsequent stores.
1453 sync += r1.store_index.eq(req_idx)
1454 sync += r1.store_row.eq(req_row)
1455 sync += r1.end_row_ix.eq(get_row_of_line(req_row)-1)
1456 sync += r1.reload_tag.eq(req_tag)
1457 sync += r1.req.same_tag.eq(1)
1458
1459 with m.If(req.op == Op.OP_STORE_HIT):
1460 sync += r1.store_way.eq(req.hit_way)
1461
1462 #with m.If(r1.dec_acks):
1463 # sync += r1.acks_pending.eq(r1.acks_pending - 1)
1464
1465 # Reset per-row valid bits,
1466 # ready for handling OP_LOAD_MISS
1467 for i in range(ROW_PER_LINE):
1468 sync += r1.rows_valid[i].eq(0)
1469
1470 with m.If(req_op != Op.OP_NONE):
1471 sync += Display("cache op %d", req.op)
1472
1473 with m.Switch(req.op):
1474 with m.Case(Op.OP_LOAD_HIT):
1475 # stay in IDLE state
1476 pass
1477
1478 with m.Case(Op.OP_LOAD_MISS):
1479 sync += Display("cache miss real addr: %x " \
1480 "idx: %x tag: %x",
1481 req.real_addr, req_row, req_tag)
1482
1483 # Start the wishbone cycle
1484 sync += r1.wb.we.eq(0)
1485 sync += r1.wb.cyc.eq(1)
1486 sync += r1.wb.stb.eq(1)
1487
1488 # Track that we had one request sent
1489 sync += r1.state.eq(State.RELOAD_WAIT_ACK)
1490 sync += r1.write_tag.eq(1)
1491
1492 with m.Case(Op.OP_LOAD_NC):
1493 sync += r1.wb.cyc.eq(1)
1494 sync += r1.wb.stb.eq(1)
1495 sync += r1.wb.we.eq(0)
1496 sync += r1.state.eq(State.NC_LOAD_WAIT_ACK)
1497
1498 with m.Case(Op.OP_STORE_HIT, Op.OP_STORE_MISS):
1499 with m.If(~req.dcbz):
1500 sync += r1.state.eq(State.STORE_WAIT_ACK)
1501 sync += r1.acks_pending.eq(1)
1502 sync += r1.full.eq(0)
1503 comb += r1_next_cycle.eq(0)
1504 sync += r1.slow_valid.eq(1)
1505
1506 with m.If(req.mmu_req):
1507 sync += r1.mmu_done.eq(1)
1508 with m.Else():
1509 sync += r1.ls_valid.eq(1)
1510
1511 with m.If(req.op == Op.OP_STORE_HIT):
1512 sync += r1.write_bram.eq(1)
1513 with m.Else():
1514 # dcbz is handled much like a load miss except
1515 # that we are writing to memory instead of reading
1516 sync += r1.state.eq(State.RELOAD_WAIT_ACK)
1517
1518 with m.If(req.op == Op.OP_STORE_MISS):
1519 sync += r1.write_tag.eq(1)
1520
1521 sync += r1.wb.we.eq(1)
1522 sync += r1.wb.cyc.eq(1)
1523 sync += r1.wb.stb.eq(1)
1524
1525 # OP_NONE and OP_BAD do nothing
1526 # OP_BAD & OP_STCX_FAIL were
1527 # handled above already
1528 with m.Case(Op.OP_NONE):
1529 pass
1530 with m.Case(Op.OP_BAD):
1531 pass
1532 with m.Case(Op.OP_STCX_FAIL):
1533 pass
1534
1535 with m.Case(State.RELOAD_WAIT_ACK):
1536 ld_stbs_done = Signal()
1537 # Requests are all sent if stb is 0
1538 comb += ld_stbs_done.eq(~r1.wb.stb)
1539
1540 # If we are still sending requests, was one accepted?
1541 with m.If((~bus.stall) & r1.wb.stb):
1542 # That was the last word? We are done sending.
1543 # Clear stb and set ld_stbs_done so we can handle an
1544 # eventual last ack on the same cycle.
1545 # sigh - reconstruct wb adr with 3 extra 0s at front
1546 wb_adr = Cat(Const(0, ROW_OFF_BITS), r1.wb.adr)
1547 with m.If(is_last_row_addr(wb_adr, r1.end_row_ix)):
1548 sync += r1.wb.stb.eq(0)
1549 comb += ld_stbs_done.eq(1)
1550
1551 # Calculate the next row address in the current cache line
1552 row = Signal(LINE_OFF_BITS-ROW_OFF_BITS)
1553 comb += row.eq(r1.wb.adr)
1554 sync += r1.wb.adr[:LINE_OFF_BITS-ROW_OFF_BITS].eq(row+1)
1555
1556 # Incoming acks processing
1557 sync += r1.forward_valid1.eq(bus.ack)
1558 with m.If(bus.ack):
1559 srow = Signal(ROW_LINE_BITS)
1560 comb += srow.eq(r1.store_row)
1561 sync += r1.rows_valid[srow].eq(1)
1562
1563 # If this is the data we were looking for,
1564 # we can complete the request next cycle.
1565 # Compare the whole address in case the
1566 # request in r1.req is not the one that
1567 # started this refill.
1568 with m.If(r1.full & r1.req.same_tag &
1569 ((r1.dcbz & req.dcbz) |
1570 (r1.req.op == Op.OP_LOAD_MISS)) &
1571 (r1.store_row == get_row(r1.req.real_addr))):
1572 sync += r1.full.eq(r1_next_cycle)
1573 sync += r1.slow_valid.eq(1)
1574 with m.If(r1.mmu_req):
1575 sync += r1.mmu_done.eq(1)
1576 with m.Else():
1577 sync += r1.ls_valid.eq(1)
1578 sync += r1.forward_sel.eq(~0) # all 1s
1579 sync += r1.use_forward1.eq(1)
1580
1581 # Check for completion
1582 with m.If(ld_stbs_done & is_last_row(r1.store_row,
1583 r1.end_row_ix)):
1584 # Complete wishbone cycle
1585 sync += r1.wb.cyc.eq(0)
1586
1587 # Cache line is now valid
1588 cv = Signal(INDEX_BITS)
1589 comb += cv.eq(cache_tags[r1.store_index].valid)
1590 comb += cv.bit_select(r1.store_way, 1).eq(1)
1591 sync += cache_tags[r1.store_index].valid.eq(cv)
1592
1593 sync += r1.state.eq(State.IDLE)
1594 sync += Display("cache valid set %x "
1595 "idx %d way %d",
1596 cv, r1.store_index, r1.store_way)
1597
1598 # Increment store row counter
1599 sync += r1.store_row.eq(next_row(r1.store_row))
1600
1601 with m.Case(State.STORE_WAIT_ACK):
1602 st_stbs_done = Signal()
1603 adjust_acks = Signal(3)
1604
1605 comb += st_stbs_done.eq(~r1.wb.stb)
1606
1607 with m.If(r1.inc_acks != r1.dec_acks):
1608 with m.If(r1.inc_acks):
1609 comb += adjust_acks.eq(r1.acks_pending + 1)
1610 with m.Else():
1611 comb += adjust_acks.eq(r1.acks_pending - 1)
1612 with m.Else():
1613 comb += adjust_acks.eq(r1.acks_pending)
1614
1615 sync += r1.acks_pending.eq(adjust_acks)
1616
1617 # Clear stb when slave accepted request
1618 with m.If(~bus.stall):
1619 # See if there is another store waiting
1620 # to be done which is in the same real page.
1621 with m.If(req.valid):
1622 _ra = req.real_addr[ROW_LINE_BITS:SET_SIZE_BITS]
1623 sync += r1.wb.adr[0:SET_SIZE_BITS].eq(_ra)
1624 sync += r1.wb.dat.eq(req.data)
1625 sync += r1.wb.sel.eq(req.byte_sel)
1626
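                        # only start another store while the 3-bit
                        # acks_pending counter cannot overflow (hence
                        # the adjust_acks < 7 guard)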
1627 with m.If((adjust_acks < 7) & req.same_tag &
1628 ((req.op == Op.OP_STORE_MISS) |
1629 (req.op == Op.OP_STORE_HIT))):
1630 sync += r1.wb.stb.eq(1)
1631 comb += st_stbs_done.eq(0)
1632 sync += r1.store_way.eq(req.hit_way)
1633 sync += r1.store_row.eq(get_row(req.real_addr))
1634
1635 with m.If(req.op == Op.OP_STORE_HIT):
1636 sync += r1.write_bram.eq(1)
1637 sync += r1.full.eq(r1_next_cycle)
1638 sync += r1.slow_valid.eq(1)
1639
1640 # Store requests never come from the MMU
1641 sync += r1.ls_valid.eq(1)
1642 comb += st_stbs_done.eq(0)
1643 sync += r1.inc_acks.eq(1)
1644 with m.Else():
1645 sync += r1.wb.stb.eq(0)
1646 comb += st_stbs_done.eq(1)
1647
1648 # Got ack ? See if complete.
1649                 sync += Display("got ack %d stbs %d adjust_acks %d",
1650                             bus.ack, st_stbs_done, adjust_acks)
1651 with m.If(bus.ack):
1652 with m.If(st_stbs_done & (adjust_acks == 1)):
1653 sync += r1.state.eq(State.IDLE)
1654 sync += r1.wb.cyc.eq(0)
1655 sync += r1.wb.stb.eq(0)
1656 sync += r1.dec_acks.eq(1)
1657
1658 with m.Case(State.NC_LOAD_WAIT_ACK):
1659 # Clear stb when slave accepted request
1660 with m.If(~bus.stall):
1661 sync += r1.wb.stb.eq(0)
1662
1663 # Got ack ? complete.
1664 with m.If(bus.ack):
1665 sync += r1.state.eq(State.IDLE)
1666 sync += r1.full.eq(r1_next_cycle)
1667 sync += r1.slow_valid.eq(1)
1668
1669 with m.If(r1.mmu_req):
1670 sync += r1.mmu_done.eq(1)
1671 with m.Else():
1672 sync += r1.ls_valid.eq(1)
1673
1674 sync += r1.forward_sel.eq(~0) # all 1s
1675 sync += r1.use_forward1.eq(1)
1676 sync += r1.wb.cyc.eq(0)
1677 sync += r1.wb.stb.eq(0)
1678
1679 def dcache_log(self, m, r1, valid_ra, tlb_hit, stall_out):
1680
1681 sync = m.d.sync
1682 d_out, bus, log_out = self.d_out, self.bus, self.log_out
1683
1684         sync += log_out.eq(Cat(r1.state[:3], valid_ra, tlb_hit.way[:3],
1685                            stall_out, r1.req.op[:3], d_out.valid, d_out.error,
1686                            r1.wb.cyc, r1.wb.stb, bus.ack, bus.stall,
1687                            r1.req.real_addr[3:6]))
1688
1689 def elaborate(self, platform):
1690
1691 m = Module()
1692 comb, sync = m.d.comb, m.d.sync
1693 m_in, d_in = self.m_in, self.d_in
1694
1695 # Storage. Hopefully "cache_rows" is a BRAM, the rest is LUTs
1696 cache_tags = CacheTagArray()
1697 cache_tag_set = Signal(TAG_RAM_WIDTH)
1698
1699 # TODO attribute ram_style : string;
1700 # TODO attribute ram_style of cache_tags : signal is "distributed";
1701
1702 """note: these are passed to nmigen.hdl.Memory as "attributes".
1703 don't know how, just that they are.
1704 """
1705 # TODO attribute ram_style of
1706 # dtlb_tags : signal is "distributed";
1707 # TODO attribute ram_style of
1708 # dtlb_ptes : signal is "distributed";
1709
1710 r0 = RegStage0("r0")
1711 r0_full = Signal()
1712
1713 r1 = RegStage1("r1")
1714
1715 reservation = Reservation("rsrv")
1716
1717 # Async signals on incoming request
1718 req_index = Signal(INDEX_BITS)
1719 req_row = Signal(ROW_BITS)
1720 req_hit_way = Signal(WAY_BITS)
1721 req_tag = Signal(TAG_BITS)
1722 req_op = Signal(Op)
1723 req_data = Signal(64)
1724 req_same_tag = Signal()
1725 req_go = Signal()
1726
1727 early_req_row = Signal(ROW_BITS)
1728
1729 cancel_store = Signal()
1730 set_rsrv = Signal()
1731 clear_rsrv = Signal()
1732
1733 r0_valid = Signal()
1734 r0_stall = Signal()
1735
1736 use_forward1_next = Signal()
1737 use_forward2_next = Signal()
1738
1739 cache_out_row = Signal(WB_DATA_BITS)
1740
1741 plru_victim = Signal(WAY_BITS)
1742 replace_way = Signal(WAY_BITS)
1743
1744 # Wishbone read/write/cache write formatting signals
1745 bus_sel = Signal(8)
1746
1747 # TLB signals
1748 tlb_way = TLBRecord("tlb_way")
1749 tlb_req_index = Signal(TLB_SET_BITS)
1750 tlb_hit = TLBHit("tlb_hit")
1751 pte = Signal(TLB_PTE_BITS)
1752 ra = Signal(REAL_ADDR_BITS)
1753 valid_ra = Signal()
1754 perm_attr = PermAttr("dc_perms")
1755 rc_ok = Signal()
1756 perm_ok = Signal()
1757 access_ok = Signal()
1758
1759 tlb_plru_victim = Signal(TLB_WAY_BITS)
1760
1761 # we don't yet handle collisions between loadstore1 requests
1762 # and MMU requests
1763 comb += self.m_out.stall.eq(0)
1764
1765 # Hold off the request in r0 when r1 has an uncompleted request
1766 comb += r0_stall.eq(r0_full & (r1.full | d_in.hold))
1767 comb += r0_valid.eq(r0_full & ~r1.full & ~d_in.hold)
1768 comb += self.stall_out.eq(r0_stall)
1769 # debugging: detect if any stall ever requested, which is fine,
1770 # but if a request comes in when stall requested, that's bad.
1771 with m.If(r0_stall):
1772 sync += self.any_stall_out.eq(1)
1773 with m.If(d_in.valid):
1774 sync += self.dreq_when_stall.eq(1)
1775 with m.If(m_in.valid):
1776 sync += self.mreq_when_stall.eq(1)
1777
1778 # deal with litex not doing wishbone pipeline mode
1779 # XXX in wrong way. FIFOs are needed in the SRAM test
1780 # so that stb/ack match up. same thing done in icache.py
1781 if not self.microwatt_compat:
1782 comb += self.bus.stall.eq(self.bus.cyc & ~self.bus.ack)
1783
1784 # Wire up wishbone request latch out of stage 1
1785 comb += self.bus.we.eq(r1.wb.we)
1786 comb += self.bus.adr.eq(r1.wb.adr)
1787 comb += self.bus.sel.eq(r1.wb.sel)
1788 comb += self.bus.stb.eq(r1.wb.stb)
1789 comb += self.bus.dat_w.eq(r1.wb.dat)
1790 comb += self.bus.cyc.eq(r1.wb.cyc)
1791
1792 # create submodule TLBUpdate
1793 m.submodules.dtlb_update = self.dtlb_update = DTLBUpdate()
1794
1795 # call sub-functions putting everything together, using shared
1796 # signals established above
1797 self.stage_0(m, r0, r1, r0_full)
1798 self.tlb_read(m, r0_stall, tlb_way)
1799 self.tlb_search(m, tlb_req_index, r0, r0_valid,
1800 tlb_way,
1801 pte, tlb_hit, valid_ra, perm_attr, ra)
1802 self.tlb_update(m, r0_valid, r0, tlb_req_index,
1803 tlb_hit, tlb_plru_victim)
1804 self.maybe_plrus(m, r1, plru_victim)
1805 self.maybe_tlb_plrus(m, r1, tlb_plru_victim, tlb_req_index)
1806 self.cache_tag_read(m, r0_stall, req_index, cache_tag_set, cache_tags)
1807 self.dcache_request(m, r0, ra, req_index, req_row, req_tag,
1808 r0_valid, r1, cache_tags, replace_way,
1809 use_forward1_next, use_forward2_next,
1810 req_hit_way, plru_victim, rc_ok, perm_attr,
1811 valid_ra, perm_ok, access_ok, req_op, req_go,
1812 tlb_hit, tlb_way, cache_tag_set,
1813 cancel_store, req_same_tag, r0_stall, early_req_row)
1814 self.reservation_comb(m, cancel_store, set_rsrv, clear_rsrv,
1815 r0_valid, r0, reservation)
1816 self.reservation_reg(m, r0_valid, access_ok, set_rsrv, clear_rsrv,
1817 reservation, r0)
1818 self.writeback_control(m, r1, cache_out_row)
1819 self.rams(m, r1, early_req_row, cache_out_row, replace_way)
1820 self.dcache_fast_hit(m, req_op, r0_valid, r0, r1,
1821 req_hit_way, req_index, req_tag, access_ok,
1822 tlb_hit, tlb_req_index)
1823 self.dcache_slow(m, r1, use_forward1_next, use_forward2_next,
1824 r0, replace_way,
1825 req_hit_way, req_same_tag,
1826 r0_valid, req_op, cache_tags, req_go, ra)
1827 #self.dcache_log(m, r1, valid_ra, tlb_hit, stall_out)
1828
1829 return m
1830
1831
1832 if __name__ == '__main__':
1833 dut = DCache()
1834 vl = rtlil.convert(dut, ports=[])
1835 with open("test_dcache.il", "w") as f:
1836 f.write(vl)