src/soc/experiment/dcache.py

   1 """DCache
   2
   3 based on Anton Blanchard microwatt dcache.vhdl
   4
   5 note that the microwatt dcache wishbone interface expects "stall".
   6 for simplicity at the moment this is hard-coded to cyc & ~ack.
   7 see WB4 spec, p84, section 5.2.1
   8
   9 IMPORTANT: for store, the data is sampled the cycle AFTER the "valid"
  10 is raised.  sigh
  11
  12 Links:
  13
  14 * https://libre-soc.org/3d_gpu/architecture/set_associative_cache.jpg
  15 * https://bugs.libre-soc.org/show_bug.cgi?id=469
  16 * https://libre-soc.org/irclog-microwatt/%23microwatt.2021-12-07.log.html
  17   (discussion about brams for ECP5)
  18
  19 """
  20
  21 import sys
  22
  23 from nmutil.gtkw import write_gtkw
  24
  25 sys.setrecursionlimit(1000000)
  26
  27 from enum import Enum, unique
  28
  29 from nmigen import (Module, Signal, Elaboratable, Cat, Repl, Array, Const,
  30                     Record, Memory)
  31 from nmutil.util import Display
  32 from nmigen.lib.coding import Decoder
  33
  34 from copy import deepcopy
  35 from random import randint, seed
  36
  37 from nmigen_soc.wishbone.bus import Interface
  38
  39 from nmigen.cli import main
  40 from nmutil.iocontrol import RecordObject
  41 from nmigen.utils import log2_int
  42 from soc.experiment.mem_types import (LoadStore1ToDCacheType,
  43                                      DCacheToLoadStore1Type,
  44                                      MMUToDCacheType,
  45                                      DCacheToMMUType)
  46
  47 from soc.experiment.wb_types import (WB_ADDR_BITS, WB_DATA_BITS, WB_SEL_BITS,
  48                                 WBAddrType, WBDataType, WBSelType,
  49                                 WBMasterOut, WBSlaveOut,
  50                                 WBMasterOutVector, WBSlaveOutVector,
  51                                 WBIOMasterOut, WBIOSlaveOut)
  52
  53 from soc.experiment.cache_ram import CacheRam
  54 from soc.experiment.plru import PLRU, PLRUs
  55 #from nmutil.plru import PLRU, PLRUs
  56
  57 # for test
  58 from soc.bus.sram import SRAM
  59 from nmigen import Memory
  60 from nmigen.cli import rtlil
  61
  62 # NOTE: to use cxxsim, export NMIGEN_SIM_MODE=cxxsim from the shell
  63 # Also, check out the cxxsim nmigen branch, and latest yosys from git
  64 from nmutil.sim_tmp_alternative import Simulator
  65
  66 from nmutil.util import wrap
  67
  68
  69 # TODO: make these parameters of DCache at some point
LINE_SIZE = 64    # Line size in bytes
NUM_LINES = 32    # Number of lines per way (i.e. number of cache indices)
NUM_WAYS = 4      # Number of ways
TLB_SET_SIZE = 64 # Number of L1 DTLB sets (depth of the TLB memories)
TLB_NUM_WAYS = 2  # L1 DTLB number of ways (entries held in each set)
TLB_LG_PGSZ = 12  # L1 DTLB log_2(page_size)
LOG_LENGTH = 0    # Non-zero to enable log data collection
  77
  78 # BRAM organisation: We never access more than
  79 #     -- WB_DATA_BITS at a time so to save
  80 #     -- resources we make the array only that wide, and
  81 #     -- use consecutive indices to make a cache "line"
  82 #     --
  83 #     -- ROW_SIZE is the width in bytes of the BRAM
  84 #     -- (based on WB, so 64-bits)
  85 ROW_SIZE = WB_DATA_BITS // 8;
  86
  87 # ROW_PER_LINE is the number of row (wishbone
  88 # transactions) in a line
  89 ROW_PER_LINE = LINE_SIZE // ROW_SIZE
  90
  91 # BRAM_ROWS is the number of rows in BRAM needed
  92 # to represent the full dcache
  93 BRAM_ROWS = NUM_LINES * ROW_PER_LINE
  94
  95 print ("ROW_SIZE", ROW_SIZE)
  96 print ("ROW_PER_LINE", ROW_PER_LINE)
  97 print ("BRAM_ROWS", BRAM_ROWS)
  98 print ("NUM_WAYS", NUM_WAYS)
  99
# Bit field counts in the address

# REAL_ADDR_BITS is the number of real address
# bits that we store
REAL_ADDR_BITS = 56

# ROW_BITS is the number of bits to select a row
ROW_BITS = log2_int(BRAM_ROWS)

# ROW_LINE_BITS is the number of bits to select
# a row within a line
ROW_LINE_BITS = log2_int(ROW_PER_LINE)

# LINE_OFF_BITS is the number of bits for
# the offset in a cache line
LINE_OFF_BITS = log2_int(LINE_SIZE)

# ROW_OFF_BITS is the number of bits for
# the offset in a row
ROW_OFF_BITS = log2_int(ROW_SIZE)

# INDEX_BITS is the number of bits to
# select a cache line
INDEX_BITS = log2_int(NUM_LINES)

# SET_SIZE_BITS is the log base 2 of the set size
# (byte offset within a line plus the line index)
SET_SIZE_BITS = LINE_OFF_BITS + INDEX_BITS

# TAG_BITS is the number of bits of
# the tag part of the address
TAG_BITS = REAL_ADDR_BITS - SET_SIZE_BITS

# TAG_WIDTH is the width in bits of each way of the tag RAM:
# TAG_BITS rounded up to the next multiple of 8
TAG_WIDTH = TAG_BITS + 7 - ((TAG_BITS + 7) % 8)

# WAY_BITS is the number of bits to select a way
WAY_BITS = log2_int(NUM_WAYS)
 137
# Example of layout for 32 lines of 64 bytes.
# The f-string below is a picture of how a real address is split into
# tag / index / row / offset fields, annotated with the bit counts
# computed above; it is printed as a side effect of importing the module.
layout = f"""\
  DCache Layout:
 |.. -----------------------| REAL_ADDR_BITS ({REAL_ADDR_BITS})
  ..         |--------------| SET_SIZE_BITS ({SET_SIZE_BITS})
  ..  tag    |index|  line  |
  ..         |   row   |    |
  ..         |     |---|    | ROW_LINE_BITS ({ROW_LINE_BITS})
  ..         |     |--- - --| LINE_OFF_BITS ({LINE_OFF_BITS})
  ..         |         |- --| ROW_OFF_BITS  ({ROW_OFF_BITS})
  ..         |----- ---|    | ROW_BITS      ({ROW_BITS})
  ..         |-----|        | INDEX_BITS    ({INDEX_BITS})
  .. --------|              | TAG_BITS      ({TAG_BITS})
"""
print (layout)
 153 print ("Dcache TAG %d IDX %d ROW_BITS %d ROFF %d LOFF %d RLB %d" % \
 154             (TAG_BITS, INDEX_BITS, ROW_BITS,
 155              ROW_OFF_BITS, LINE_OFF_BITS, ROW_LINE_BITS))
 156 print ("index @: %d-%d" % (LINE_OFF_BITS, SET_SIZE_BITS))
 157 print ("row @: %d-%d" % (LINE_OFF_BITS, ROW_OFF_BITS))
 158 print ("tag @: %d-%d width %d" % (SET_SIZE_BITS, REAL_ADDR_BITS, TAG_WIDTH))
 159
 160 TAG_RAM_WIDTH = TAG_WIDTH * NUM_WAYS
 161
 162 print ("TAG_RAM_WIDTH", TAG_RAM_WIDTH)
 163 print ("    TAG_WIDTH", TAG_WIDTH)
 164 print ("     NUM_WAYS", NUM_WAYS)
 165 print ("    NUM_LINES", NUM_LINES)
 166
 167
 168 def CacheTag(name=None):
 169     tag_layout = [('valid', NUM_WAYS),
 170                   ('tag', TAG_RAM_WIDTH),
 171                  ]
 172     return Record(tag_layout, name=name)
 173
 174
 175 def CacheTagArray():
 176     return Array(CacheTag(name="tag%d" % x) for x in range(NUM_LINES))
 177
 178
 179 def RowPerLineValidArray():
 180     return Array(Signal(name="rows_valid%d" % x) \
 181                         for x in range(ROW_PER_LINE))
 182
 183
 184 # L1 TLB
 185 TLB_SET_BITS     = log2_int(TLB_SET_SIZE)
 186 TLB_WAY_BITS     = log2_int(TLB_NUM_WAYS)
 187 TLB_EA_TAG_BITS  = 64 - (TLB_LG_PGSZ + TLB_SET_BITS)
 188 TLB_TAG_WAY_BITS = TLB_NUM_WAYS * TLB_EA_TAG_BITS
 189 TLB_PTE_BITS     = 64
 190 TLB_PTE_WAY_BITS = TLB_NUM_WAYS * TLB_PTE_BITS;
 191
 192 def ispow2(x):
 193     return (1<<log2_int(x, False)) == x
 194
# Import-time sanity checks on the cache geometry.

# a line must be made of a whole number of rows
assert (LINE_SIZE % ROW_SIZE) == 0, "LINE_SIZE not multiple of ROW_SIZE"
# the address is carved up by bit-slicing, so every size must be 2**n
assert ispow2(LINE_SIZE), "LINE_SIZE not power of 2"
assert ispow2(NUM_LINES), "NUM_LINES not power of 2"
assert ispow2(ROW_PER_LINE), "ROW_PER_LINE not power of 2"
# the two ways of splitting the low address bits (index/line-offset and
# row/row-offset) must describe the same number of bits
assert ROW_BITS == (INDEX_BITS + ROW_LINE_BITS), "geometry bits don't add up"
assert (LINE_OFF_BITS == ROW_OFF_BITS + ROW_LINE_BITS), \
        "geometry bits don't add up"
assert REAL_ADDR_BITS == (TAG_BITS + INDEX_BITS + LINE_OFF_BITS), \
        "geometry bits don't add up"
assert REAL_ADDR_BITS == (TAG_BITS + ROW_BITS + ROW_OFF_BITS), \
         "geometry bits don't add up"
assert 64 == WB_DATA_BITS, "Can't yet handle wb width that isn't 64-bits"
# the index + line offset must fit inside the page offset
assert SET_SIZE_BITS <= TLB_LG_PGSZ, "Set indexed by virtual address"
 208
 209
 210 def TLBHit(name):
 211     return Record([('valid', 1),
 212                    ('way', TLB_WAY_BITS)], name=name)
 213
 214 def TLBTagEAArray():
 215     return Array(Signal(TLB_EA_TAG_BITS, name="tlbtagea%d" % x) \
 216                 for x in range (TLB_NUM_WAYS))
 217
 218 def TLBRecord(name):
 219     tlb_layout = [('valid', TLB_NUM_WAYS),
 220                   ('tag', TLB_TAG_WAY_BITS),
 221                   ('pte', TLB_PTE_WAY_BITS)
 222                  ]
 223     return Record(tlb_layout, name=name)
 224
 225 def TLBValidArray():
 226     return Array(Signal(TLB_NUM_WAYS, name="tlb_valid%d" % x)
 227                         for x in range(TLB_SET_SIZE))
 228
 229 def HitWaySet():
 230     return Array(Signal(WAY_BITS, name="hitway_%d" % x) \
 231                         for x in range(TLB_NUM_WAYS))
 232
 233 # Cache RAM interface
 234 def CacheRamOut():
 235     return Array(Signal(WB_DATA_BITS, name="cache_out%d" % x) \
 236                  for x in range(NUM_WAYS))
 237
 238 # PLRU output interface
 239 def PLRUOut():
 240     return Array(Signal(WAY_BITS, name="plru_out%d" % x) \
 241                 for x in range(NUM_LINES))
 242
 243 # TLB PLRU output interface
 244 def TLBPLRUOut():
 245     return Array(Signal(TLB_WAY_BITS, name="tlbplru_out%d" % x) \
 246                 for x in range(TLB_SET_SIZE))
 247
 248 # Helper functions to decode incoming requests
 249 #
 250 # Return the cache line index (tag index) for an address
 251 def get_index(addr):
 252     return addr[LINE_OFF_BITS:SET_SIZE_BITS]
 253
 254 # Return the cache row index (data memory) for an address
 255 def get_row(addr):
 256     return addr[ROW_OFF_BITS:SET_SIZE_BITS]
 257
 258 # Return the index of a row within a line
 259 def get_row_of_line(row):
 260     return row[:ROW_BITS][:ROW_LINE_BITS]
 261
 262 # Returns whether this is the last row of a line
 263 def is_last_row_addr(addr, last):
 264     return addr[ROW_OFF_BITS:LINE_OFF_BITS] == last
 265
 266 # Returns whether this is the last row of a line
 267 def is_last_row(row, last):
 268     return get_row_of_line(row) == last
 269
 270 # Return the next row in the current cache line. We use a
 271 # dedicated function in order to limit the size of the
 272 # generated adder to be only the bits within a cache line
 273 # (3 bits with default settings)
 274 def next_row(row):
 275     row_v = row[0:ROW_LINE_BITS] + 1
 276     return Cat(row_v[:ROW_LINE_BITS], row[ROW_LINE_BITS:])
 277
 278 # Get the tag value from the address
 279 def get_tag(addr):
 280     return addr[SET_SIZE_BITS:REAL_ADDR_BITS]
 281
 282 # Read a tag from a tag memory row
 283 def read_tag(way, tagset):
 284     return tagset.word_select(way, TAG_WIDTH)[:TAG_BITS]
 285
 286 # Read a TLB tag from a TLB tag memory row
 287 def read_tlb_tag(way, tags):
 288     return tags.word_select(way, TLB_EA_TAG_BITS)
 289
 290 # Write a TLB tag to a TLB tag memory row
 291 def write_tlb_tag(way, tags, tag):
 292     return read_tlb_tag(way, tags).eq(tag)
 293
 294 # Read a PTE from a TLB PTE memory row
 295 def read_tlb_pte(way, ptes):
 296     return ptes.word_select(way, TLB_PTE_BITS)
 297
 298 def write_tlb_pte(way, ptes, newpte):
 299     return read_tlb_pte(way, ptes).eq(newpte)
 300
 301
 302 # Record for storing permission, attribute, etc. bits from a PTE
class PermAttr(RecordObject):
    """Record of six 1-bit permission/attribute flags taken from a PTE.

    NOTE(review): which PTE bit feeds each flag is not visible in this
    class; the meanings below are inferred from the field names only and
    should be confirmed against the code that drives them.
    """
    def __init__(self, name=None):
        super().__init__(name=name)
        self.reference = Signal() # presumably the PTE "referenced" bit
        self.changed   = Signal() # presumably the PTE "changed" bit
        self.nocache   = Signal() # presumably marks the page non-cacheable
        self.priv      = Signal() # presumably marks the page privileged
        self.rd_perm   = Signal() # presumably read permission
        self.wr_perm   = Signal() # presumably write permission
 312
 313
def extract_perm_attr(pte):
    """Return a fresh, unconnected PermAttr record.

    NOTE: this is a stub -- the *pte* argument is currently ignored and no
    field of the returned record is driven here.
    """
    pa = PermAttr()
    return pa;
 317
 318
 319 # Type of operation on a "valid" input
@unique
class Op(Enum):
    """Classification of a request; used as the shape of
    MemAccessRequest.op.
    """
    OP_NONE       = 0
    OP_BAD        = 1 # NC cache hit, TLB miss, prot/RC failure
    OP_STCX_FAIL  = 2 # conditional store w/o reservation
    OP_LOAD_HIT   = 3 # Cache hit on load
    OP_LOAD_MISS  = 4 # Load missing cache
    OP_LOAD_NC    = 5 # Non-cachable load
    OP_STORE_HIT  = 6 # Store hitting cache
    OP_STORE_MISS = 7 # Store missing cache
 330
 331
 332 # Cache state machine
@unique
class State(Enum):
    """States of the cache state machine; used as the shape of
    RegStage1.state.
    """
    IDLE             = 0 # Normal load hit processing
    RELOAD_WAIT_ACK  = 1 # Cache reload wait ack
    STORE_WAIT_ACK   = 2 # Store wait ack
    NC_LOAD_WAIT_ACK = 3 # Non-cachable load wait ack
 339
 340
 341 # Dcache operations:
 342 #
 343 # In order to make timing, we use the BRAMs with
 344 # an output buffer, which means that the BRAM
 345 # output is delayed by an extra cycle.
 346 #
 347 # Thus, the dcache has a 2-stage internal pipeline
 348 # for cache hits with no stalls.
 349 #
 350 # All other operations are handled via stalling
 351 # in the first stage.
 352 #
 353 # The second stage can thus complete a hit at the same
 354 # time as the first stage emits a stall for a complex op.
 355 #
 356 # Stage 0 register, basically contains just the latched request
 357
class RegStage0(RecordObject):
    """Stage 0 register: the latched request plus flags describing
    where it came from (loadstore1 or the MMU) and what kind it is.
    """
    def __init__(self, name=None):
        super().__init__(name=name)
        # the request itself, in loadstore1's request format
        self.req     = LoadStore1ToDCacheType(name="lsmem")
        self.tlbie   = Signal() # indicates a tlbie request (from MMU)
        self.doall   = Signal() # with tlbie, indicates flush whole TLB
        self.tlbld   = Signal() # indicates a TLB load request (from MMU)
        self.mmu_req = Signal() # indicates source of request
        self.d_valid = Signal() # indicates req.data is valid now
 367
 368
class MemAccessRequest(RecordObject):
    """Record describing one classified memory access (held in
    RegStage1.req).
    """
    def __init__(self, name=None):
        super().__init__(name=name)
        self.op        = Signal(Op)             # request class, see Op
        self.valid     = Signal()
        self.dcbz      = Signal()
        self.real_addr = Signal(REAL_ADDR_BITS) # real (physical) address
        self.data      = Signal(64)
        self.byte_sel  = Signal(8)              # one bit per byte of data
        self.hit_way   = Signal(WAY_BITS)       # cache way selector
        self.same_tag  = Signal()
        self.mmu_req   = Signal()
 381
 382
 383 # First stage register, contains state for stage 1 of load hits
 384 # and for the state machine used by all other operations
class RegStage1(RecordObject):
    """Stage 1 register.

    Holds the state for stage 1 of load hits together with the state used
    by the state machine that handles all other operations.  Fields are
    grouped below by purpose.
    """
    def __init__(self, name=None):
        super().__init__(name=name)
        # Info about the request
        self.full             = Signal() # have uncompleted request
        self.mmu_req          = Signal() # request is from MMU
        self.req              = MemAccessRequest(name="reqmem")

        # Cache hit state
        self.hit_way          = Signal(WAY_BITS)
        self.hit_load_valid   = Signal()
        self.hit_index        = Signal(INDEX_BITS)
        self.cache_hit        = Signal()

        # TLB hit state
        self.tlb_hit          = TLBHit("tlb_hit")
        self.tlb_hit_index    = Signal(TLB_SET_BITS)

        # 2-stage data buffer for data forwarded from writes to reads
        self.forward_data1    = Signal(64)
        self.forward_data2    = Signal(64)
        self.forward_sel1     = Signal(8)
        self.forward_valid1   = Signal()
        self.forward_way1     = Signal(WAY_BITS)
        self.forward_row1     = Signal(ROW_BITS)
        self.use_forward1     = Signal()
        self.forward_sel      = Signal(8)

        # Cache miss state (reload state machine)
        self.state            = Signal(State)
        self.dcbz             = Signal()
        self.write_bram       = Signal()
        self.write_tag        = Signal()
        self.slow_valid       = Signal()
        self.wb               = WBMasterOut("wb")   # wishbone master outputs
        self.reload_tag       = Signal(TAG_BITS)
        self.store_way        = Signal(WAY_BITS)
        self.store_row        = Signal(ROW_BITS)
        self.store_index      = Signal(INDEX_BITS)
        self.end_row_ix       = Signal(ROW_LINE_BITS)
        self.rows_valid       = RowPerLineValidArray() # one flag per row
        self.acks_pending     = Signal(3)
        self.inc_acks         = Signal()
        self.dec_acks         = Signal()

        # Signals to complete (possibly with error)
        self.ls_valid         = Signal()
        self.ls_error         = Signal()
        self.mmu_done         = Signal()
        self.mmu_error        = Signal()
        self.cache_paradox    = Signal()

        # Signal to complete a failed stcx.
        self.stcx_fail        = Signal()
 439
 440
 441 # Reservation information
class Reservation(RecordObject):
    """Reservation record: a valid flag and a reserved address."""
    def __init__(self):
        super().__init__()
        self.valid = Signal()
        # 64-bit address with the in-line offset bits dropped
        self.addr  = Signal(64-LINE_OFF_BITS)
 447
 448
class DTLBUpdate(Elaboratable):
    """L1 DTLB storage: tag and PTE memories plus per-set valid bits.

    Handles three kinds of update (tlbie of everything, tlbie of the hit
    way, and writing a new entry) and provides a registered read of one
    whole TLB set through ``tlb_way``.
    """
    def __init__(self):
        # update controls
        self.tlbie    = Signal()   # invalidate request
        self.tlbwe    = Signal()   # write a new tag/PTE entry
        self.doall    = Signal()   # with tlbie: clear every valid bit
        self.tlb_hit     = TLBHit("tlb_hit")   # way to invalidate on tlbie
        self.tlb_req_index = Signal(TLB_SET_BITS) # set being updated

        self.repl_way        = Signal(TLB_WAY_BITS)     # way written by tlbwe
        self.eatag           = Signal(TLB_EA_TAG_BITS)  # tag to write
        self.pte_data        = Signal(TLB_PTE_BITS)     # PTE to write

        # read from dtlb array
        self.tlb_read       = Signal()
        self.tlb_read_index = Signal(TLB_SET_BITS)
        self.tlb_way        = TLBRecord("o_tlb_way")

    def elaborate(self, platform):
        """Build the TLB memories, the update logic and the read path."""
        m = Module()
        comb = m.d.comb
        sync = m.d.sync

        # there are 3 parts to this:
        # QTY TLB_NUM_WAYs TAGs - of width (say) 46 bits of Effective Address
        # QTY TLB_NUM_WAYs PTEs - of width (say) 64 bits
        # "Valid" bits, one per "way", of QTY TLB_NUM_WAYs.  these cannot
        # be a Memory because they can all be cleared (tlbie, doall), i mean,
        # we _could_, in theory, by overriding the Reset Signal of the Memory,
        # hmmm....

        dtlb_valid = TLBValidArray()
        tlb_req_index = self.tlb_req_index

        # debug output, emitted every time the module is elaborated
        print ("TLB_TAG_WAY_BITS", TLB_TAG_WAY_BITS)
        print ("     TLB_EA_TAG_BITS", TLB_EA_TAG_BITS)
        print ("        TLB_NUM_WAYS", TLB_NUM_WAYS)
        print ("TLB_PTE_WAY_BITS", TLB_PTE_WAY_BITS)
        print ("    TLB_PTE_BITS", TLB_PTE_BITS)
        print ("    TLB_NUM_WAYS", TLB_NUM_WAYS)

        # TAG and PTE Memory SRAMs. transparent, write-enables are TLB_NUM_WAYS
        # (the write granularity is one way, so each way has its own enable)
        tagway = Memory(depth=TLB_SET_SIZE, width=TLB_TAG_WAY_BITS)
        m.submodules.rd_tagway = rd_tagway = tagway.read_port()
        m.submodules.wr_tagway = wr_tagway = tagway.write_port(
                                    granularity=TLB_EA_TAG_BITS)

        pteway = Memory(depth=TLB_SET_SIZE, width=TLB_PTE_WAY_BITS)
        m.submodules.rd_pteway = rd_pteway = pteway.read_port()
        m.submodules.wr_pteway = wr_pteway = pteway.write_port(
                                    granularity=TLB_PTE_BITS)

        # commented out for now, can be put in if Memory.reset can be
        # used for tlbie&doall to reset the entire Memory to zero in 1 cycle
        #validm = Memory(depth=TLB_SET_SIZE, width=TLB_NUM_WAYS)
        #m.submodules.rd_valid = rd_valid = validm.read_port()
        #m.submodules.wr_valid = wr_valid = validm.write_port(
                                    #granularity=1)

        # connect up read and write addresses to Valid/PTE/TAG SRAMs
        # (reads use tlb_read_index, writes use tlb_req_index)
        m.d.comb += rd_pteway.addr.eq(self.tlb_read_index)
        m.d.comb += rd_tagway.addr.eq(self.tlb_read_index)
        #m.d.comb += rd_valid.addr.eq(self.tlb_read_index)
        m.d.comb += wr_tagway.addr.eq(tlb_req_index)
        m.d.comb += wr_pteway.addr.eq(tlb_req_index)
        #m.d.comb += wr_valid.addr.eq(tlb_req_index)

        updated  = Signal()     # tag + PTE memories are to be written
        v_updated  = Signal()   # valid bits of the set are to be written
        tb_out = Signal(TLB_TAG_WAY_BITS) # tlb_way_tags_t
        db_out = Signal(TLB_NUM_WAYS)     # tlb_way_valids_t
        pb_out = Signal(TLB_PTE_WAY_BITS) # tlb_way_ptes_t
        dv = Signal(TLB_NUM_WAYS) # tlb_way_valids_t

        # start from the current valid bits of the addressed set; the
        # branches below override individual bits of db_out
        comb += dv.eq(dtlb_valid[tlb_req_index])
        comb += db_out.eq(dv)

        with m.If(self.tlbie & self.doall):
            # clear all valid bits at once
            # XXX hmmm, validm _could_ use Memory reset here...
            for i in range(TLB_SET_SIZE):
                sync += dtlb_valid[i].eq(0)
        with m.Elif(self.tlbie):
            # invalidate just the hit_way
            with m.If(self.tlb_hit.valid):
                comb += db_out.bit_select(self.tlb_hit.way, 1).eq(0)
                comb += v_updated.eq(1)
        with m.Elif(self.tlbwe):
            # write to the requested tag and PTE
            comb += write_tlb_tag(self.repl_way, tb_out, self.eatag)
            comb += write_tlb_pte(self.repl_way, pb_out, self.pte_data)
            # set valid bit
            comb += db_out.bit_select(self.repl_way, 1).eq(1)

            comb += updated.eq(1)
            comb += v_updated.eq(1)

        # above, sometimes valid is requested to be updated but data not
        # therefore split them out, here.  note the granularity thing matches
        # with the shift-up of the eatag/pte_data into the correct TLB way.
        # thus is it not necessary to write the entire lot, just the portion
        # being altered: hence writing the *old* copy of the row is not needed
        with m.If(updated): # PTE and TAG to be written
            comb += wr_pteway.data.eq(pb_out)
            comb += wr_pteway.en.eq(1<<self.repl_way)
            comb += wr_tagway.data.eq(tb_out)
            comb += wr_tagway.en.eq(1<<self.repl_way)
        with m.If(v_updated): # Valid to be written
            sync += dtlb_valid[tlb_req_index].eq(db_out)
            #comb += wr_valid.data.eq(db_out)
            #comb += wr_valid.en.eq(1<<self.repl_way)

        # select one TLB way, use a register here
        r_tlb_way        = TLBRecord("r_tlb_way")
        r_delay = Signal()      # tlb_read delayed by one clock
        sync += r_delay.eq(self.tlb_read)
        with m.If(self.tlb_read):
            # valid bits are not in a Memory, so register them directly
            sync += self.tlb_way.valid.eq(dtlb_valid[self.tlb_read_index])
        with m.If(r_delay):
            # on one clock delay, output the contents of the read port(s)
            # comb += self.tlb_way.valid.eq(rd_valid.data)
            comb += self.tlb_way.tag.eq(rd_tagway.data)
            comb += self.tlb_way.pte.eq(rd_pteway.data)
            # and also capture the (delayed) output...
            #sync += r_tlb_way.valid.eq(rd_valid.data)
            sync += r_tlb_way.tag.eq(rd_tagway.data)
            sync += r_tlb_way.pte.eq(rd_pteway.data)
        with m.Else():
            # ... so that the register can output it when no read is requested
            # it's rather overkill but better to be safe than sorry
            comb += self.tlb_way.tag.eq(r_tlb_way.tag)
            comb += self.tlb_way.pte.eq(r_tlb_way.pte)
            #comb += self.tlb_way.eq(r_tlb_way)

        return m
 583
 584
class DCachePendingHit(Elaboratable):
    """Combinatorial check of whether the pending request hits the cache.

    Compares the request's tag against every way of the selected cache
    tag set.  In virtual mode the comparison is done once per TLB way and
    the result belonging to the hitting TLB way is selected afterwards.
    """

    def __init__(self, tlb_way,
                      cache_i_validdx, cache_tag_set,
                    req_addr):
        """Store the externally supplied inputs and create the ports.

        tlb_way          -- TLB set record (valid/tag/pte for all ways)
        cache_i_validdx  -- per-way valid bits of the selected cache index
        cache_tag_set    -- tag row of the selected cache index
        req_addr         -- address of the pending request
        """

        self.go          = Signal()   # enables the tag comparison
        self.virt_mode   = Signal()   # translate through the TLB first
        self.is_hit      = Signal()   # out: request hits the cache
        self.tlb_hit      = TLBHit("tlb_hit")
        self.hit_way     = Signal(WAY_BITS)   # out: cache way that hit
        self.rel_match   = Signal()   # out: tag equals reload_tag
        self.req_index   = Signal(INDEX_BITS) # not used by elaborate()
        self.reload_tag  = Signal(TAG_BITS)

        self.tlb_way = tlb_way
        self.cache_i_validdx = cache_i_validdx
        self.cache_tag_set = cache_tag_set
        self.req_addr = req_addr

    def elaborate(self, platform):
        """Build the (purely combinatorial) hit-detection logic."""
        m = Module()
        comb = m.d.comb
        sync = m.d.sync

        go = self.go
        virt_mode = self.virt_mode
        is_hit = self.is_hit
        tlb_way = self.tlb_way
        cache_i_validdx = self.cache_i_validdx
        cache_tag_set = self.cache_tag_set
        req_addr = self.req_addr
        tlb_hit = self.tlb_hit
        hit_way = self.hit_way
        rel_match = self.rel_match
        req_index = self.req_index
        reload_tag = self.reload_tag

        # per-TLB-way intermediate results
        hit_set     = Array(Signal(name="hit_set_%d" % i) \
                                  for i in range(TLB_NUM_WAYS))
        rel_matches = Array(Signal(name="rel_matches_%d" % i) \
                                    for i in range(TLB_NUM_WAYS))
        hit_way_set = HitWaySet()

        # Test if pending request is a hit on any way
        # In order to make timing in virtual mode,
        # when we are using the TLB, we compare each
        # way with each of the real addresses from each way of
        # the TLB, and then decide later which match to use.

        with m.If(virt_mode):
            for j in range(TLB_NUM_WAYS): # tlb_num_way_t
                s_tag       = Signal(TAG_BITS, name="s_tag%d" % j)
                s_hit       = Signal(name="s_hit%d" % j)
                s_pte       = Signal(TLB_PTE_BITS, name="s_pte%d" % j)
                s_ra        = Signal(REAL_ADDR_BITS, name="s_ra%d" % j)
                # read the PTE, calc the Real Address, get the tag
                # (page offset from the request, upper bits from the PTE)
                comb += s_pte.eq(read_tlb_pte(j, tlb_way.pte))
                comb += s_ra.eq(Cat(req_addr[0:TLB_LG_PGSZ],
                                    s_pte[TLB_LG_PGSZ:REAL_ADDR_BITS]))
                comb += s_tag.eq(get_tag(s_ra))
                # for each way check the tag against the cache tag set
                for i in range(NUM_WAYS): # way_t
                    is_tag_hit = Signal(name="is_tag_hit_%d_%d" % (j, i))
                    comb += is_tag_hit.eq(go & cache_i_validdx[i] &
                                  (read_tag(i, cache_tag_set) == s_tag)
                                  & (tlb_way.valid[j]))
                    with m.If(is_tag_hit):
                        comb += hit_way_set[j].eq(i)
                        comb += s_hit.eq(1)
                comb += hit_set[j].eq(s_hit)
                comb += rel_matches[j].eq(s_tag == reload_tag)
            # pick the results of the TLB way that actually hit
            with m.If(tlb_hit.valid):
                comb += is_hit.eq(hit_set[tlb_hit.way])
                comb += hit_way.eq(hit_way_set[tlb_hit.way])
                comb += rel_match.eq(rel_matches[tlb_hit.way])
        with m.Else():
            # real mode: the tag comes straight from the request address
            s_tag       = Signal(TAG_BITS)
            comb += s_tag.eq(get_tag(req_addr))
            for i in range(NUM_WAYS): # way_t
                is_tag_hit = Signal(name="is_tag_hit_%d" % i)
                comb += is_tag_hit.eq(go & cache_i_validdx[i] &
                          (read_tag(i, cache_tag_set) == s_tag))
                with m.If(is_tag_hit):
                    comb += hit_way.eq(i)
                    comb += is_hit.eq(1)
            with m.If(s_tag == reload_tag):
                comb += rel_match.eq(1)

        return m
 675
 676
 677 class DCache(Elaboratable):
 678     """Set associative dcache write-through
 679
 680     TODO (in no specific order):
 681     * See list in icache.vhdl
 682     * Complete load misses on the cycle when WB data comes instead of
 683       at the end of line (this requires dealing with requests coming in
 684       while not idle...)
 685     """
 686     def __init__(self, pspec=None):
 687         self.d_in      = LoadStore1ToDCacheType("d_in")
 688         self.d_out     = DCacheToLoadStore1Type("d_out")
 689
 690         self.m_in      = MMUToDCacheType("m_in")
 691         self.m_out     = DCacheToMMUType("m_out")
 692
 693         self.stall_out = Signal()
 694
 695         # standard naming (wired to non-standard for compatibility)
 696         self.bus = Interface(addr_width=32,
 697                             data_width=64,
 698                             granularity=8,
 699                             features={'stall'},
 700                             alignment=0,
 701                             name="dcache")
 702
 703         self.log_out   = Signal(20)
 704
 705         # test if microwatt compatibility is to be enabled
 706         self.microwatt_compat = (hasattr(pspec, "microwatt_compat") and
 707                                  (pspec.microwatt_compat == True))
 708
    def stage_0(self, m, r0, r1, r0_full):
        """Latch the request in r0.req as long as we're not stalling

        m        -- Module being built
        r0       -- stage 0 register that receives the latched request
        r1       -- stage 1 register; only r1.full is read here
        r0_full  -- flag register: r0 holds a valid request

        A candidate request ``r`` is built combinatorially from m_in when
        the MMU has a valid request, otherwise from d_in, and is copied
        into r0 when the latch condition below holds.
        """
        comb = m.d.comb
        sync = m.d.sync
        d_in, d_out, m_in = self.d_in, self.d_out, self.m_in

        r = RegStage0("stage0")

        # TODO, this goes in unit tests and formal proofs
        with m.If(d_in.valid & m_in.valid):
            sync += Display("request collision loadstore vs MMU")

        # an MMU request takes precedence over one from loadstore1
        with m.If(m_in.valid):
            comb += r.req.valid.eq(1)
            comb += r.req.load.eq(~(m_in.tlbie | m_in.tlbld))# no invalidate
            comb += r.req.dcbz.eq(0)
            comb += r.req.nc.eq(0)
            comb += r.req.reserve.eq(0)
            comb += r.req.virt_mode.eq(0)
            comb += r.req.priv_mode.eq(1)
            comb += r.req.addr.eq(m_in.addr)
            comb += r.req.data.eq(m_in.pte)
            comb += r.req.byte_sel.eq(~0) # Const -1 sets all to 0b111....
            comb += r.tlbie.eq(m_in.tlbie)
            comb += r.doall.eq(m_in.doall)
            comb += r.tlbld.eq(m_in.tlbld)
            comb += r.mmu_req.eq(1)
            # MMU data (the PTE) is available immediately
            comb += r.d_valid.eq(1)
            m.d.sync += Display("    DCACHE req mmu addr %x pte %x ld %d",
                                 m_in.addr, m_in.pte, r.req.load)

        with m.Else():
            comb += r.req.eq(d_in)
            # loadstore1 data is not taken yet: it is sampled one cycle
            # later (see the Elif below), so start with zero / not valid
            comb += r.req.data.eq(0)
            comb += r.tlbie.eq(0)
            comb += r.doall.eq(0)
            comb += r.tlbld.eq(0)
            comb += r.mmu_req.eq(0)
            comb += r.d_valid.eq(0)

        # latch a new request when stage 1 is free and loadstore1 is not
        # holding, or when r0 is empty
        with m.If((~r1.full & ~d_in.hold) | ~r0_full):
            sync += r0.eq(r)
            sync += r0_full.eq(r.req.valid)
        with m.Elif(~r0.d_valid):
            # Sample data the cycle after a request comes in from loadstore1.
            # If another request has come in already then the data will get
            # put directly into req.data below.
            sync += r0.req.data.eq(d_in.data)
            sync += r0.d_valid.eq(1)
        with m.If(d_in.valid):
            m.d.sync += Display("    DCACHE req cache "
                                "virt %d addr %x data %x ld %d",
                                 r.req.virt_mode, r.req.addr,
                                 r.req.data, r.req.load)
 764
    def tlb_read(self, m, r0_stall, tlb_way):
        """TLB
        Operates in the second cycle on the request latched in r0.req.
        TLB updates write the entry at the end of the second cycle.

        m         -- Module being built
        r0_stall  -- when set, no new TLB read is requested
        tlb_way   -- record that receives the TLB set read by dtlb_update

        The TLB set index is taken from the incoming address: m_in.addr
        when the MMU request is valid, otherwise d_in.addr.
        NOTE(review): self.dtlb_update is not created in __init__; it is
        presumably assigned before this method is called -- confirm.
        """
        comb = m.d.comb
        sync = m.d.sync
        m_in, d_in = self.m_in, self.d_in

        addrbits = Signal(TLB_SET_BITS)

        # bit range of the address that selects the TLB set
        amin = TLB_LG_PGSZ
        amax = TLB_LG_PGSZ + TLB_SET_BITS

        with m.If(m_in.valid):
            comb += addrbits.eq(m_in.addr[amin : amax])
        with m.Else():
            comb += addrbits.eq(d_in.addr[amin : amax])

        # If we have any op and the previous op isn't finished,
        # then keep the same output for next cycle.
        d = self.dtlb_update
        comb += d.tlb_read_index.eq(addrbits)
        comb += d.tlb_read.eq(~r0_stall)
        comb += tlb_way.eq(d.tlb_way)
 790
 791     def maybe_tlb_plrus(self, m, r1, tlb_plru_victim, tlb_req_index):
 792         """Generate TLB PLRUs
 793         """
 794         comb = m.d.comb
 795         sync = m.d.sync
 796
 797         if TLB_NUM_WAYS == 0:
 798             return
 799
 800         # suite of PLRUs with a selection and output mechanism
 801         tlb_plrus = PLRUs(TLB_SET_SIZE, TLB_WAY_BITS)
 802         m.submodules.tlb_plrus = tlb_plrus
 803         comb += tlb_plrus.way.eq(r1.tlb_hit.way)
 804         comb += tlb_plrus.valid.eq(r1.tlb_hit.valid)
 805         comb += tlb_plrus.index.eq(r1.tlb_hit_index)
 806         comb += tlb_plrus.isel.eq(tlb_req_index) # select victim
 807         comb += tlb_plru_victim.eq(tlb_plrus.o_index) # selected victim
 808
 809     def tlb_search(self, m, tlb_req_index, r0, r0_valid,
 810                    tlb_way,
 811                    pte, tlb_hit, valid_ra, perm_attr, ra):
 812
 813         comb = m.d.comb
 814
 815         hitway = Signal(TLB_WAY_BITS)
 816         hit    = Signal()
 817         eatag  = Signal(TLB_EA_TAG_BITS)
 818
 819         TLB_LG_END = TLB_LG_PGSZ + TLB_SET_BITS
 820         comb += tlb_req_index.eq(r0.req.addr[TLB_LG_PGSZ : TLB_LG_END])
 821         comb += eatag.eq(r0.req.addr[TLB_LG_END : 64 ])
 822
 823         for i in range(TLB_NUM_WAYS):
 824             is_tag_hit = Signal(name="is_tag_hit%d" % i)
 825             tlb_tag = Signal(TLB_EA_TAG_BITS, name="tlb_tag%d" % i)
 826             comb += tlb_tag.eq(read_tlb_tag(i, tlb_way.tag))
 827             comb += is_tag_hit.eq((tlb_way.valid[i]) & (tlb_tag == eatag))
 828             with m.If(is_tag_hit):
 829                 comb += hitway.eq(i)
 830                 comb += hit.eq(1)
 831
 832         comb += tlb_hit.valid.eq(hit & r0_valid)
 833         comb += tlb_hit.way.eq(hitway)
 834
 835         with m.If(tlb_hit.valid):
 836             comb += pte.eq(read_tlb_pte(hitway, tlb_way.pte))
 837         comb += valid_ra.eq(tlb_hit.valid | ~r0.req.virt_mode)
 838
 839         with m.If(r0.req.virt_mode):
 840             comb += ra.eq(Cat(Const(0, ROW_OFF_BITS),
 841                               r0.req.addr[ROW_OFF_BITS:TLB_LG_PGSZ],
 842                               pte[TLB_LG_PGSZ:REAL_ADDR_BITS]))
 843             comb += perm_attr.reference.eq(pte[8])
 844             comb += perm_attr.changed.eq(pte[7])
 845             comb += perm_attr.nocache.eq(pte[5])
 846             comb += perm_attr.priv.eq(pte[3])
 847             comb += perm_attr.rd_perm.eq(pte[2])
 848             comb += perm_attr.wr_perm.eq(pte[1])
 849         with m.Else():
 850             comb += ra.eq(Cat(Const(0, ROW_OFF_BITS),
 851                               r0.req.addr[ROW_OFF_BITS:REAL_ADDR_BITS]))
 852             comb += perm_attr.reference.eq(1)
 853             comb += perm_attr.changed.eq(1)
 854             comb += perm_attr.nocache.eq(0)
 855             comb += perm_attr.priv.eq(1)
 856             comb += perm_attr.rd_perm.eq(1)
 857             comb += perm_attr.wr_perm.eq(1)
 858
 859         with m.If(valid_ra):
 860             m.d.sync += Display("DCACHE virt mode %d hit %d ra %x pte %x",
 861                                 r0.req.virt_mode, tlb_hit.valid, ra, pte)
 862             m.d.sync += Display("       perm ref=%d", perm_attr.reference)
 863             m.d.sync += Display("       perm chg=%d", perm_attr.changed)
 864             m.d.sync += Display("       perm noc=%d", perm_attr.nocache)
 865             m.d.sync += Display("       perm prv=%d", perm_attr.priv)
 866             m.d.sync += Display("       perm rdp=%d", perm_attr.rd_perm)
 867             m.d.sync += Display("       perm wrp=%d", perm_attr.wr_perm)
 868
 869     def tlb_update(self, m, r0_valid, r0, tlb_req_index,
 870                     tlb_hit, tlb_plru_victim):
 871
 872         comb = m.d.comb
 873         sync = m.d.sync
 874
 875         tlbie    = Signal()
 876         tlbwe    = Signal()
 877
 878         comb += tlbie.eq(r0_valid & r0.tlbie)
 879         comb += tlbwe.eq(r0_valid & r0.tlbld)
 880
 881         d = self.dtlb_update
 882
 883         comb += d.tlbie.eq(tlbie)
 884         comb += d.tlbwe.eq(tlbwe)
 885         comb += d.doall.eq(r0.doall)
 886         comb += d.tlb_hit.eq(tlb_hit)
 887         comb += d.tlb_req_index.eq(tlb_req_index)
 888
 889         with m.If(tlb_hit.valid):
 890             comb += d.repl_way.eq(tlb_hit.way)
 891         with m.Else():
 892             comb += d.repl_way.eq(tlb_plru_victim)
 893         comb += d.eatag.eq(r0.req.addr[TLB_LG_PGSZ + TLB_SET_BITS:64])
 894         comb += d.pte_data.eq(r0.req.data)
 895
 896     def maybe_plrus(self, m, r1, plru_victim):
 897         """Generate PLRUs
 898         """
 899         comb = m.d.comb
 900         sync = m.d.sync
 901
 902         if TLB_NUM_WAYS == 0:
 903             return
 904
 905         # suite of PLRUs with a selection and output mechanism
 906         m.submodules.plrus = plrus = PLRUs(NUM_LINES, WAY_BITS)
 907         comb += plrus.way.eq(r1.hit_way)
 908         comb += plrus.valid.eq(r1.cache_hit)
 909         comb += plrus.index.eq(r1.hit_index)
 910         comb += plrus.isel.eq(r1.store_index) # select victim
 911         comb += plru_victim.eq(plrus.o_index) # selected victim
 912
 913     def cache_tag_read(self, m, r0_stall, req_index, cache_tag_set, cache_tags):
 914         """Cache tag RAM read port
 915         """
 916         comb = m.d.comb
 917         sync = m.d.sync
 918         m_in, d_in = self.m_in, self.d_in
 919
 920         index = Signal(INDEX_BITS)
 921
 922         with m.If(r0_stall):
 923             comb += index.eq(req_index)
 924         with m.Elif(m_in.valid):
 925             comb += index.eq(get_index(m_in.addr))
 926         with m.Else():
 927             comb += index.eq(get_index(d_in.addr))
 928         sync += cache_tag_set.eq(cache_tags[index].tag)
 929
    def dcache_request(self, m, r0, ra, req_index, req_row, req_tag,
                       r0_valid, r1, cache_tags, replace_way,
                       use_forward1_next, use_forward2_next,
                       req_hit_way, plru_victim, rc_ok, perm_attr,
                       valid_ra, perm_ok, access_ok, req_op, req_go,
                       tlb_hit, tlb_way, cache_tag_set,
                       cancel_store, req_same_tag, r0_stall, early_req_row):
        """Cache request parsing and hit detection

        Purely combinatorial (apart from debug Display statements).

        Signals driven here: req_index, req_row, req_tag, req_same_tag,
        use_forward1_next, use_forward2_next, req_hit_way, replace_way,
        rc_ok, perm_ok, access_ok, req_op, req_go, early_req_row.

        Signals only read here: r0, ra, r0_valid, r1, cache_tags,
        plru_victim, perm_attr, valid_ra, tlb_hit, tlb_way,
        cache_tag_set, cancel_store, r0_stall.

        NOTE: several signals get a default that is overridden further
        down (is_hit, hit_way, op), so statement order is significant.
        """

        comb = m.d.comb
        m_in, d_in = self.m_in, self.d_in

        is_hit      = Signal()
        hit_way     = Signal(WAY_BITS)
        op          = Signal(Op)
        opsel       = Signal(3)
        go          = Signal()
        nc          = Signal()
        cache_i_validdx = Signal(NUM_WAYS)

        # Extract line, row and tag from request
        # (index and row come from the request address, tag from ra)
        comb += req_index.eq(get_index(r0.req.addr))
        comb += req_row.eq(get_row(r0.req.addr))
        comb += req_tag.eq(get_tag(ra))

        if False: # display on comb is a bit... busy.
            comb += Display("dcache_req addr:%x ra: %x idx: %x tag: %x row: %x",
                    r0.req.addr, ra, req_index, req_tag, req_row)

        # a request goes ahead only if r0 is valid, it is neither a
        # tlbie nor a tlbld, and r1 is not flagging a load/store error
        comb += go.eq(r0_valid & ~(r0.tlbie | r0.tlbld) & ~r1.ls_error)
        comb += cache_i_validdx.eq(cache_tags[req_index].valid)

        # hit detection proper is delegated to the DCachePendingHit
        # submodule; its results are the defaults for is_hit / hit_way
        m.submodules.dcache_pend = dc = DCachePendingHit(tlb_way,
                                            cache_i_validdx, cache_tag_set,
                                            r0.req.addr)
        comb += dc.tlb_hit.eq(tlb_hit)
        comb += dc.reload_tag.eq(r1.reload_tag)
        comb += dc.virt_mode.eq(r0.req.virt_mode)
        comb += dc.go.eq(go)
        comb += dc.req_index.eq(req_index)

        comb += is_hit.eq(dc.is_hit)
        comb += hit_way.eq(dc.hit_way)
        comb += req_same_tag.eq(dc.rel_match)

        # See if the request matches the line currently being reloaded
        # (this overrides the is_hit / hit_way defaults set just above)
        with m.If((r1.state == State.RELOAD_WAIT_ACK) &
                  (req_index == r1.store_index) & req_same_tag):
            # For a store, consider this a hit even if the row isn't
            # valid since it will be by the time we perform the store.
            # For a load, check the appropriate row valid bit.
            rrow = Signal(ROW_LINE_BITS)
            comb += rrow.eq(req_row)
            valid = r1.rows_valid[rrow]
            comb += is_hit.eq((~r0.req.load) | valid)
            comb += hit_way.eq(replace_way)

        # Whether to use forwarded data for a load or not
        with m.If((get_row(r1.req.real_addr) == req_row) &
                  (r1.req.hit_way == hit_way)):
            # Only need to consider r1.write_bram here, since if we
            # are writing refill data here, then we don't have a
            # cache hit this cycle on the line being refilled.
            # (There is the possibility that the load following the
            # load miss that started the refill could be to the old
            # contents of the victim line, since it is a couple of
            # cycles after the refill starts before we see the updated
            # cache tag. In that case we don't use the bypass.)
            comb += use_forward1_next.eq(r1.write_bram)
        with m.If((r1.forward_row1 == req_row) & (r1.forward_way1 == hit_way)):
            comb += use_forward2_next.eq(r1.forward_valid1)

        # The way that matched on a hit
        comb += req_hit_way.eq(hit_way)

        # The way to replace on a miss: the PLRU victim while the tag is
        # being written (r1.write_tag), otherwise the way held in r1
        with m.If(r1.write_tag):
            comb += replace_way.eq(plru_victim)
        with m.Else():
            comb += replace_way.eq(r1.store_way)

        # work out whether we have permission for this access
        # NB we don't yet implement AMR, thus no KUAP
        # rc_ok: reference bit set, and changed bit set unless a load
        comb += rc_ok.eq(perm_attr.reference
                         & (r0.req.load | perm_attr.changed))
        # perm_ok: (privileged request or non-privileged page) and
        # (write permission, or a load with read permission)
        comb += perm_ok.eq((r0.req.priv_mode | (~perm_attr.priv)) &
                           (perm_attr.wr_perm |
                              (r0.req.load & perm_attr.rd_perm)))
        comb += access_ok.eq(valid_ra & perm_ok & rc_ok)

        # Combine the request and cache hit status to decide what
        # operation needs to be done
        comb += nc.eq(r0.req.nc | perm_attr.nocache)
        comb += op.eq(Op.OP_NONE)
        with m.If(go):
            with m.If(~access_ok):
                m.d.sync += Display("DCACHE access fail valid_ra=%d p=%d rc=%d",
                                 valid_ra, perm_ok, rc_ok)
                comb += op.eq(Op.OP_BAD)
            with m.Elif(cancel_store):
                m.d.sync += Display("DCACHE cancel store")
                comb += op.eq(Op.OP_STCX_FAIL)
            with m.Else():
                m.d.sync += Display("DCACHE valid_ra=%d nc=%d ld=%d",
                                 valid_ra, nc, r0.req.load)
                # opsel bits, LSB first: is_hit, nc, load
                comb += opsel.eq(Cat(is_hit, nc, r0.req.load))
                with m.Switch(opsel):
                    with m.Case(0b101): comb += op.eq(Op.OP_LOAD_HIT)
                    with m.Case(0b100): comb += op.eq(Op.OP_LOAD_MISS)
                    with m.Case(0b110): comb += op.eq(Op.OP_LOAD_NC)
                    with m.Case(0b001): comb += op.eq(Op.OP_STORE_HIT)
                    with m.Case(0b000): comb += op.eq(Op.OP_STORE_MISS)
                    with m.Case(0b010): comb += op.eq(Op.OP_STORE_MISS)
                    with m.Case(0b011): comb += op.eq(Op.OP_BAD)
                    with m.Case(0b111): comb += op.eq(Op.OP_BAD)
        comb += req_op.eq(op)
        comb += req_go.eq(go)

        # Version of the row number that is valid one cycle earlier
        # in the cases where we need to read the cache data BRAM.
        # If we're stalling then we need to keep reading the last
        # row requested.
        with m.If(~r0_stall):
            with m.If(m_in.valid):
                comb += early_req_row.eq(get_row(m_in.addr))
            with m.Else():
                comb += early_req_row.eq(get_row(d_in.addr))
        with m.Else():
            comb += early_req_row.eq(req_row)
1060
1061     def reservation_comb(self, m, cancel_store, set_rsrv, clear_rsrv,
1062                          r0_valid, r0, reservation):
1063         """Handle load-with-reservation and store-conditional instructions
1064         """
1065         comb = m.d.comb
1066
1067         with m.If(r0_valid & r0.req.reserve):
1068             # XXX generate alignment interrupt if address
1069             # is not aligned XXX or if r0.req.nc = '1'
1070             with m.If(r0.req.load):
1071                 comb += set_rsrv.eq(r0.req.atomic_last) # load with reservation
1072             with m.Else():
1073                 comb += clear_rsrv.eq(r0.req.atomic_last) # store conditional
1074                 with m.If((~reservation.valid) |
1075                          (r0.req.addr[LINE_OFF_BITS:64] != reservation.addr)):
1076                     comb += cancel_store.eq(1)
1077
1078     def reservation_reg(self, m, r0_valid, access_ok, set_rsrv, clear_rsrv,
1079                         reservation, r0):
1080         comb = m.d.comb
1081         sync = m.d.sync
1082
1083         with m.If(r0_valid & access_ok):
1084             with m.If(clear_rsrv):
1085                 sync += reservation.valid.eq(0)
1086             with m.Elif(set_rsrv):
1087                 sync += reservation.valid.eq(1)
1088                 sync += reservation.addr.eq(r0.req.addr[LINE_OFF_BITS:64])
1089
1090     def writeback_control(self, m, r1, cache_out_row):
1091         """Return data for loads & completion control logic
1092         """
1093         comb = m.d.comb
1094         sync = m.d.sync
1095         d_out, m_out = self.d_out, self.m_out
1096
1097         data_out = Signal(64)
1098         data_fwd = Signal(64)
1099
1100         # Use the bypass if are reading the row that was
1101         # written 1 or 2 cycles ago, including for the
1102         # slow_valid = 1 case (i.e. completing a load
1103         # miss or a non-cacheable load).
1104         with m.If(r1.use_forward1):
1105             comb += data_fwd.eq(r1.forward_data1)
1106         with m.Else():
1107             comb += data_fwd.eq(r1.forward_data2)
1108
1109         comb += data_out.eq(cache_out_row)
1110
1111         for i in range(8):
1112             with m.If(r1.forward_sel[i]):
1113                 dsel = data_fwd.word_select(i, 8)
1114                 comb += data_out.word_select(i, 8).eq(dsel)
1115
1116         # DCache output to LoadStore
1117         comb += d_out.valid.eq(r1.ls_valid)
1118         comb += d_out.data.eq(data_out)
1119         comb += d_out.store_done.eq(~r1.stcx_fail)
1120         comb += d_out.error.eq(r1.ls_error)
1121         comb += d_out.cache_paradox.eq(r1.cache_paradox)
1122
1123         # Outputs to MMU
1124         comb += m_out.done.eq(r1.mmu_done)
1125         comb += m_out.err.eq(r1.mmu_error)
1126         comb += m_out.data.eq(data_out)
1127
1128         # We have a valid load or store hit or we just completed
1129         # a slow op such as a load miss, a NC load or a store
1130         #
1131         # Note: the load hit is delayed by one cycle. However it
1132         # can still not collide with r.slow_valid (well unless I
1133         # miscalculated) because slow_valid can only be set on a
1134         # subsequent request and not on its first cycle (the state
1135         # machine must have advanced), which makes slow_valid
1136         # at least 2 cycles from the previous hit_load_valid.
1137
1138         # Sanity: Only one of these must be set in any given cycle
1139
1140         if False: # TODO: need Display to get this to work
1141             assert (r1.slow_valid & r1.stcx_fail) != 1, \
1142             "unexpected slow_valid collision with stcx_fail"
1143
1144             assert ((r1.slow_valid | r1.stcx_fail) | r1.hit_load_valid) != 1, \
1145              "unexpected hit_load_delayed collision with slow_valid"
1146
1147         with m.If(~r1.mmu_req):
1148             # Request came from loadstore1...
1149             # Load hit case is the standard path
1150             with m.If(r1.hit_load_valid):
1151                 sync += Display("completing load hit data=%x", data_out)
1152
1153             # error cases complete without stalling
1154             with m.If(r1.ls_error):
1155                 with m.If(r1.dcbz):
1156                     sync += Display("completing dcbz with error")
1157                 with m.Else():
1158                     sync += Display("completing ld/st with error")
1159
1160             # Slow ops (load miss, NC, stores)
1161             with m.If(r1.slow_valid):
1162                 sync += Display("completing store or load miss adr=%x data=%x",
1163                                 r1.req.real_addr, data_out)
1164
1165         with m.Else():
1166             # Request came from MMU
1167             with m.If(r1.hit_load_valid):
1168                 sync += Display("completing load hit to MMU, data=%x",
1169                                 m_out.data)
1170             # error cases complete without stalling
1171             with m.If(r1.mmu_error):
1172                 sync += Display("combpleting MMU ld with error")
1173
1174             # Slow ops (i.e. load miss)
1175             with m.If(r1.slow_valid):
1176                 sync += Display("completing MMU load miss, adr=%x data=%x",
1177                                 r1.req.real_addr, m_out.data)
1178
    def rams(self, m, r1, early_req_row, cache_out_row, replace_way):
        """rams
        Generate a cache RAM for each way. This handles the normal
        reads, writes from reloads and the special store-hit update
        path as well.

        Note: the BRAMs have an extra read buffer, meaning the output
        is pipelined an extra cycle. This differs from the
        icache. The writeback logic needs to take that into
        account by using 1-cycle delayed signals for load hits.

        * m:             Module under construction
        * r1:            register supplying state, write_bram, dcbz,
                         hit_way, store_row and the pending request
        * early_req_row: row address used for every RAM read
        * cache_out_row: driven with the read data of the way selected
                         by r1.hit_way
        * replace_way:   way written by reload data
        """
        comb = m.d.comb
        bus = self.bus

        # a Binary-to-Unary one-hots here.  replace-way one-hot is gated
        # (enabled) by bus.ack, not-write-bram, and state RELOAD_WAIT_ACK
        m.submodules.rams_replace_way_e = rwe = Decoder(NUM_WAYS)
        comb += rwe.n.eq(~((r1.state == State.RELOAD_WAIT_ACK) & bus.ack &
                   ~r1.write_bram))
        comb += rwe.i.eq(replace_way)

        # one-hot of the way that hit, used to pick the read data below
        m.submodules.rams_hit_way_e = hwe = Decoder(NUM_WAYS)
        comb += hwe.i.eq(r1.hit_way)

        # this one is gated with write_bram, and replace_way_e can never be
        # set at the same time.  that means that do_write can OR the outputs
        m.submodules.rams_hit_req_way_e = hre = Decoder(NUM_WAYS)
        comb += hre.n.eq(~r1.write_bram) # Decoder.n is inverted
        comb += hre.i.eq(r1.req.hit_way)

        # common Signals (shared by all ways)
        do_read  = Signal()
        wr_addr  = Signal(ROW_BITS)
        wr_data  = Signal(WB_DATA_BITS)
        wr_sel   = Signal(ROW_SIZE)
        rd_addr  = Signal(ROW_BITS)

        comb += do_read.eq(1) # always enable
        comb += rd_addr.eq(early_req_row)

        # Write mux:
        #
        # Defaults to wishbone read responses (cache refill)
        #
        # For timing, the mux on wr_data/sel/addr is not
        # dependent on anything other than the current state.

        with m.If(r1.write_bram):
            # Write store data to BRAM.  This happens one
            # cycle after the store is in r0.
            comb += wr_data.eq(r1.req.data)
            comb += wr_sel.eq(r1.req.byte_sel)
            comb += wr_addr.eq(get_row(r1.req.real_addr))

        with m.Else():
            # Otherwise, we might be doing a reload or a DCBZ
            # (DCBZ writes zeros, a reload writes the wishbone read data)
            with m.If(r1.dcbz):
                comb += wr_data.eq(0)
            with m.Else():
                comb += wr_data.eq(bus.dat_r)
            comb += wr_addr.eq(r1.store_row)
            comb += wr_sel.eq(~0) # all 1s

        # set up Cache Rams
        for i in range(NUM_WAYS):
            do_write = Signal(name="do_wr%d" % i)
            wr_sel_m = Signal(ROW_SIZE, name="wr_sel_m_%d" % i)
            d_out   = Signal(WB_DATA_BITS, name="dout_%d" % i) # cache_row_t

            way = CacheRam(ROW_BITS, WB_DATA_BITS, ADD_BUF=True, ram_num=i)
            m.submodules["cacheram_%d" % i] = way

            # every way sees the same addresses and write data; only the
            # masked write select (wr_sel_m) differs per way
            comb += way.rd_en.eq(do_read)
            comb += way.rd_addr.eq(rd_addr)
            comb += d_out.eq(way.rd_data_o)
            comb += way.wr_sel.eq(wr_sel_m)
            comb += way.wr_addr.eq(wr_addr)
            comb += way.wr_data.eq(wr_data)

            # Cache hit reads
            with m.If(hwe.o[i]):
                comb += cache_out_row.eq(d_out)

            # these are mutually-exclusive via their Decoder-enablers
            # (note: Decoder-enable is inverted)
            comb += do_write.eq(hre.o[i] | rwe.o[i])

            # Mask write selects with do_write since BRAM
            # doesn't have a global write-enable
            # (wr_sel_m is not assigned anywhere else in this method)
            with m.If(do_write):
                comb += wr_sel_m.eq(wr_sel)
1269                 comb += wr_sel_m.eq(wr_sel)
1270
1271     # Cache hit synchronous machine for the easy case.
1272     # This handles load hits.
1273     # It also handles error cases (TLB miss, cache paradox)
1274     def dcache_fast_hit(self, m, req_op, r0_valid, r0, r1,
1275                         req_hit_way, req_index, req_tag, access_ok,
1276                         tlb_hit, tlb_req_index):
1277         comb = m.d.comb
1278         sync = m.d.sync
1279
1280         with m.If(req_op != Op.OP_NONE):
1281             sync += Display("op:%d addr:%x nc: %d idx: %x tag: %x way: %x",
1282                     req_op, r0.req.addr, r0.req.nc,
1283                     req_index, req_tag, req_hit_way)
1284
1285         with m.If(r0_valid):
1286             sync += r1.mmu_req.eq(r0.mmu_req)
1287
1288         # Fast path for load/store hits.
1289         # Set signals for the writeback controls.
1290         sync += r1.hit_way.eq(req_hit_way)
1291         sync += r1.hit_index.eq(req_index)
1292
1293         sync += r1.hit_load_valid.eq(req_op == Op.OP_LOAD_HIT)
1294         sync += r1.cache_hit.eq((req_op == Op.OP_LOAD_HIT) |
1295                                 (req_op == Op.OP_STORE_HIT))
1296
1297         with m.If(req_op == Op.OP_BAD):
1298             sync += Display("Signalling ld/st error "
1299                             "ls_error=%i mmu_error=%i cache_paradox=%i",
1300                             ~r0.mmu_req,r0.mmu_req,access_ok)
1301             sync += r1.ls_error.eq(~r0.mmu_req)
1302             sync += r1.mmu_error.eq(r0.mmu_req)
1303             sync += r1.cache_paradox.eq(access_ok)
1304         with m.Else():
1305             sync += r1.ls_error.eq(0)
1306             sync += r1.mmu_error.eq(0)
1307             sync += r1.cache_paradox.eq(0)
1308
1309         sync += r1.stcx_fail.eq(req_op == Op.OP_STCX_FAIL)
1310
1311         # Record TLB hit information for updating TLB PLRU
1312         sync += r1.tlb_hit.eq(tlb_hit)
1313         sync += r1.tlb_hit_index.eq(tlb_req_index)
1314
1315     # Memory accesses are handled by this state machine:
1316     #
1317     #   * Cache load miss/reload (in conjunction with "rams")
1318     #   * Load hits for non-cachable forms
1319     #   * Stores (the collision case is handled in "rams")
1320     #
1321     # All wishbone requests generation is done here.
1322     # This machine operates at stage 1.
1323     def dcache_slow(self, m, r1, use_forward1_next, use_forward2_next,
1324                     r0, replace_way,
1325                     req_hit_way, req_same_tag,
1326                     r0_valid, req_op, cache_tags, req_go, ra):
1327
1328         comb = m.d.comb
1329         sync = m.d.sync
1330         bus = self.bus
1331         d_in = self.d_in
1332
1333         req         = MemAccessRequest("mreq_ds")
1334
1335         req_row = Signal(ROW_BITS)
1336         req_idx = Signal(INDEX_BITS)
1337         req_tag = Signal(TAG_BITS)
1338         comb += req_idx.eq(get_index(req.real_addr))
1339         comb += req_row.eq(get_row(req.real_addr))
1340         comb += req_tag.eq(get_tag(req.real_addr))
1341
1342         sync += r1.use_forward1.eq(use_forward1_next)
1343         sync += r1.forward_sel.eq(0)
1344
1345         with m.If(use_forward1_next):
1346             sync += r1.forward_sel.eq(r1.req.byte_sel)
1347         with m.Elif(use_forward2_next):
1348             sync += r1.forward_sel.eq(r1.forward_sel1)
1349
1350         sync += r1.forward_data2.eq(r1.forward_data1)
1351         with m.If(r1.write_bram):
1352             sync += r1.forward_data1.eq(r1.req.data)
1353             sync += r1.forward_sel1.eq(r1.req.byte_sel)
1354             sync += r1.forward_way1.eq(r1.req.hit_way)
1355             sync += r1.forward_row1.eq(get_row(r1.req.real_addr))
1356             sync += r1.forward_valid1.eq(1)
1357         with m.Else():
1358             with m.If(r1.dcbz):
1359                 sync += r1.forward_data1.eq(0)
1360             with m.Else():
1361                 sync += r1.forward_data1.eq(bus.dat_r)
1362             sync += r1.forward_sel1.eq(~0) # all 1s
1363             sync += r1.forward_way1.eq(replace_way)
1364             sync += r1.forward_row1.eq(r1.store_row)
1365             sync += r1.forward_valid1.eq(0)
1366
1367         # One cycle pulses reset
1368         sync += r1.slow_valid.eq(0)
1369         sync += r1.write_bram.eq(0)
1370         sync += r1.inc_acks.eq(0)
1371         sync += r1.dec_acks.eq(0)
1372
1373         sync += r1.ls_valid.eq(0)
1374         # complete tlbies and TLB loads in the third cycle
1375         sync += r1.mmu_done.eq(r0_valid & (r0.tlbie | r0.tlbld))
1376
1377         with m.If((req_op == Op.OP_LOAD_HIT) | (req_op == Op.OP_STCX_FAIL)):
1378             with m.If(r0.mmu_req):
1379                 sync += r1.mmu_done.eq(1)
1380             with m.Else():
1381                 sync += r1.ls_valid.eq(1)
1382
1383         with m.If(r1.write_tag):
1384             # Store new tag in selected way
1385             replace_way_onehot = Signal(NUM_WAYS)
1386             comb += replace_way_onehot.eq(1<<replace_way)
1387             for i in range(NUM_WAYS):
1388                 with m.If(replace_way_onehot[i]):
1389                     ct = Signal(TAG_RAM_WIDTH)
1390                     comb += ct.eq(cache_tags[r1.store_index].tag)
1391                     comb += ct.word_select(i, TAG_WIDTH).eq(r1.reload_tag)
1392                     sync += cache_tags[r1.store_index].tag.eq(ct)
1393             sync += r1.store_way.eq(replace_way)
1394             sync += r1.write_tag.eq(0)
1395
1396         # Take request from r1.req if there is one there,
1397         # else from req_op, ra, etc.
1398         with m.If(r1.full):
1399             comb += req.eq(r1.req)
1400         with m.Else():
1401             comb += req.op.eq(req_op)
1402             comb += req.valid.eq(req_go)
1403             comb += req.mmu_req.eq(r0.mmu_req)
1404             comb += req.dcbz.eq(r0.req.dcbz)
1405             comb += req.real_addr.eq(ra)
1406
1407             with m.If(r0.req.dcbz):
1408                 # force data to 0 for dcbz
1409                 comb += req.data.eq(0)
1410             with m.Elif(r0.d_valid):
1411                 comb += req.data.eq(r0.req.data)
1412             with m.Else():
1413                 comb += req.data.eq(d_in.data)
1414
1415             # Select all bytes for dcbz
1416             # and for cacheable loads
1417             with m.If(r0.req.dcbz | (r0.req.load & ~r0.req.nc)):
1418                 comb += req.byte_sel.eq(~0) # all 1s
1419             with m.Else():
1420                 comb += req.byte_sel.eq(r0.req.byte_sel)
1421             comb += req.hit_way.eq(req_hit_way)
1422             comb += req.same_tag.eq(req_same_tag)
1423
1424             # Store the incoming request from r0,
1425             # if it is a slow request
1426             # Note that r1.full = 1 implies req_op = OP_NONE
1427             with m.If((req_op == Op.OP_LOAD_MISS)
1428                       | (req_op == Op.OP_LOAD_NC)
1429                       | (req_op == Op.OP_STORE_MISS)
1430                       | (req_op == Op.OP_STORE_HIT)):
1431                 sync += r1.req.eq(req)
1432                 sync += r1.full.eq(1)
1433
1434         # Main state machine
1435         with m.Switch(r1.state):
1436
1437             with m.Case(State.IDLE):
1438                 sync += r1.wb.adr.eq(req.real_addr[ROW_LINE_BITS:])
1439                 sync += r1.wb.sel.eq(req.byte_sel)
1440                 sync += r1.wb.dat.eq(req.data)
1441                 sync += r1.dcbz.eq(req.dcbz)
1442
1443                 # Keep track of our index and way
1444                 # for subsequent stores.
1445                 sync += r1.store_index.eq(req_idx)
1446                 sync += r1.store_row.eq(req_row)
1447                 sync += r1.end_row_ix.eq(get_row_of_line(req_row)-1)
1448                 sync += r1.reload_tag.eq(req_tag)
1449                 sync += r1.req.same_tag.eq(1)
1450
1451                 with m.If(req.op == Op.OP_STORE_HIT):
1452                     sync += r1.store_way.eq(req.hit_way)
1453
1454                 #with m.If(r1.dec_acks):
1455                 #    sync += r1.acks_pending.eq(r1.acks_pending - 1)
1456
1457                 # Reset per-row valid bits,
1458                 # ready for handling OP_LOAD_MISS
1459                 for i in range(ROW_PER_LINE):
1460                     sync += r1.rows_valid[i].eq(0)
1461
1462                 with m.If(req_op != Op.OP_NONE):
1463                     sync += Display("cache op %d", req.op)
1464
1465                 with m.Switch(req.op):
1466                     with m.Case(Op.OP_LOAD_HIT):
1467                         # stay in IDLE state
1468                         pass
1469
1470                     with m.Case(Op.OP_LOAD_MISS):
1471                         sync += Display("cache miss real addr: %x " \
1472                                 "idx: %x tag: %x",
1473                                 req.real_addr, req_row, req_tag)
1474
1475                         # Start the wishbone cycle
1476                         sync += r1.wb.we.eq(0)
1477                         sync += r1.wb.cyc.eq(1)
1478                         sync += r1.wb.stb.eq(1)
1479
1480                         # Track that we had one request sent
1481                         sync += r1.state.eq(State.RELOAD_WAIT_ACK)
1482                         sync += r1.write_tag.eq(1)
1483
1484                     with m.Case(Op.OP_LOAD_NC):
1485                         sync += r1.wb.cyc.eq(1)
1486                         sync += r1.wb.stb.eq(1)
1487                         sync += r1.wb.we.eq(0)
1488                         sync += r1.state.eq(State.NC_LOAD_WAIT_ACK)
1489
1490                     with m.Case(Op.OP_STORE_HIT, Op.OP_STORE_MISS):
1491                         with m.If(~req.dcbz):
1492                             sync += r1.state.eq(State.STORE_WAIT_ACK)
1493                             sync += r1.acks_pending.eq(1)
1494                             sync += r1.full.eq(0)
1495                             sync += r1.slow_valid.eq(1)
1496
1497                             with m.If(req.mmu_req):
1498                                 sync += r1.mmu_done.eq(1)
1499                             with m.Else():
1500                                 sync += r1.ls_valid.eq(1)
1501
1502                             with m.If(req.op == Op.OP_STORE_HIT):
1503                                 sync += r1.write_bram.eq(1)
1504                         with m.Else():
1505                             # dcbz is handled much like a load miss except
1506                             # that we are writing to memory instead of reading
1507                             sync += r1.state.eq(State.RELOAD_WAIT_ACK)
1508
1509                             with m.If(req.op == Op.OP_STORE_MISS):
1510                                 sync += r1.write_tag.eq(1)
1511
1512                         sync += r1.wb.we.eq(1)
1513                         sync += r1.wb.cyc.eq(1)
1514                         sync += r1.wb.stb.eq(1)
1515
1516                     # OP_NONE and OP_BAD do nothing
1517                     # OP_BAD & OP_STCX_FAIL were
1518                     # handled above already
1519                     with m.Case(Op.OP_NONE):
1520                         pass
1521                     with m.Case(Op.OP_BAD):
1522                         pass
1523                     with m.Case(Op.OP_STCX_FAIL):
1524                         pass
1525
1526             with m.Case(State.RELOAD_WAIT_ACK):
1527                 ld_stbs_done = Signal()
1528                 # Requests are all sent if stb is 0
1529                 comb += ld_stbs_done.eq(~r1.wb.stb)
1530
1531                 # If we are still sending requests, was one accepted?
1532                 with m.If((~bus.stall) & r1.wb.stb):
1533                     # That was the last word?  We are done sending.
1534                     # Clear stb and set ld_stbs_done so we can handle an
1535                     # eventual last ack on the same cycle.
1536                     # sigh - reconstruct wb adr with 3 extra 0s at front
1537                     wb_adr = Cat(Const(0, ROW_OFF_BITS), r1.wb.adr)
1538                     with m.If(is_last_row_addr(wb_adr, r1.end_row_ix)):
1539                         sync += r1.wb.stb.eq(0)
1540                         comb += ld_stbs_done.eq(1)
1541
1542                     # Calculate the next row address in the current cache line
1543                     row = Signal(LINE_OFF_BITS-ROW_OFF_BITS)
1544                     comb += row.eq(r1.wb.adr)
1545                     sync += r1.wb.adr[:LINE_OFF_BITS-ROW_OFF_BITS].eq(row+1)
1546
1547                 # Incoming acks processing
1548                 sync += r1.forward_valid1.eq(bus.ack)
1549                 with m.If(bus.ack):
1550                     srow = Signal(ROW_LINE_BITS)
1551                     comb += srow.eq(r1.store_row)
1552                     sync += r1.rows_valid[srow].eq(1)
1553
1554                     # If this is the data we were looking for,
1555                     # we can complete the request next cycle.
1556                     # Compare the whole address in case the
1557                     # request in r1.req is not the one that
1558                     # started this refill.
1559                     with m.If(req.valid & r1.req.same_tag &
1560                               ((r1.dcbz & r1.req.dcbz) |
1561                                (~r1.dcbz & (r1.req.op == Op.OP_LOAD_MISS))) &
1562                                 (r1.store_row == get_row(req.real_addr))):
1563                         sync += r1.full.eq(0)
1564                         sync += r1.slow_valid.eq(1)
1565                         with m.If(r1.mmu_req):
1566                             sync += r1.mmu_done.eq(1)
1567                         with m.Else():
1568                             sync += r1.ls_valid.eq(1)
1569                         sync += r1.forward_sel.eq(~0) # all 1s
1570                         sync += r1.use_forward1.eq(1)
1571
1572                     # Check for completion
1573                     with m.If(ld_stbs_done & is_last_row(r1.store_row,
1574                                                       r1.end_row_ix)):
1575                         # Complete wishbone cycle
1576                         sync += r1.wb.cyc.eq(0)
1577
1578                         # Cache line is now valid
1579                         cv = Signal(INDEX_BITS)
1580                         comb += cv.eq(cache_tags[r1.store_index].valid)
1581                         comb += cv.bit_select(r1.store_way, 1).eq(1)
1582                         sync += cache_tags[r1.store_index].valid.eq(cv)
1583
1584                         sync += r1.state.eq(State.IDLE)
1585                         sync += Display("cache valid set %x "
1586                                         "idx %d way %d",
1587                                          cv, r1.store_index, r1.store_way)
1588
1589                     # Increment store row counter
1590                     sync += r1.store_row.eq(next_row(r1.store_row))
1591
1592             with m.Case(State.STORE_WAIT_ACK):
1593                 st_stbs_done = Signal()
1594                 adjust_acks = Signal(3)
1595
1596                 comb += st_stbs_done.eq(~r1.wb.stb)
1597
1598                 with m.If(r1.inc_acks != r1.dec_acks):
1599                     with m.If(r1.inc_acks):
1600                         comb += adjust_acks.eq(r1.acks_pending + 1)
1601                     with m.Else():
1602                         comb += adjust_acks.eq(r1.acks_pending - 1)
1603                 with m.Else():
1604                     comb += adjust_acks.eq(r1.acks_pending)
1605
1606                 sync += r1.acks_pending.eq(adjust_acks)
1607
1608                 # Clear stb when slave accepted request
1609                 with m.If(~bus.stall):
1610                     # See if there is another store waiting
1611                     # to be done which is in the same real page.
1612                     with m.If(req.valid):
1613                         _ra = req.real_addr[ROW_LINE_BITS:SET_SIZE_BITS]
1614                         sync += r1.wb.adr[0:SET_SIZE_BITS].eq(_ra)
1615                         sync += r1.wb.dat.eq(req.data)
1616                         sync += r1.wb.sel.eq(req.byte_sel)
1617
1618                     with m.If((adjust_acks < 7) & req.same_tag &
1619                                 ((req.op == Op.OP_STORE_MISS) |
1620                                  (req.op == Op.OP_STORE_HIT))):
1621                         sync += r1.wb.stb.eq(1)
1622                         comb += st_stbs_done.eq(0)
1623                         sync += r1.store_way.eq(req.hit_way)
1624                         sync += r1.store_row.eq(get_row(req.real_addr))
1625
1626                         with m.If(req.op == Op.OP_STORE_HIT):
1627                             sync += r1.write_bram.eq(1)
1628                         sync += r1.full.eq(0)
1629                         sync += r1.slow_valid.eq(1)
1630
1631                         # Store requests never come from the MMU
1632                         sync += r1.ls_valid.eq(1)
1633                         comb += st_stbs_done.eq(0)
1634                         sync += r1.inc_acks.eq(1)
1635                     with m.Else():
1636                         sync += r1.wb.stb.eq(0)
1637                         comb += st_stbs_done.eq(1)
1638
1639                 # Got ack ? See if complete.
1640                 sync += Display("got ack %d %d stbs %d adjust_acks %d",
1641                                 bus.ack, bus.ack, st_stbs_done, adjust_acks)
1642                 with m.If(bus.ack):
1643                     with m.If(st_stbs_done & (adjust_acks == 1)):
1644                         sync += r1.state.eq(State.IDLE)
1645                         sync += r1.wb.cyc.eq(0)
1646                         sync += r1.wb.stb.eq(0)
1647                     sync += r1.dec_acks.eq(1)
1648
1649             with m.Case(State.NC_LOAD_WAIT_ACK):
1650                 # Clear stb when slave accepted request
1651                 with m.If(~bus.stall):
1652                     sync += r1.wb.stb.eq(0)
1653
1654                 # Got ack ? complete.
1655                 with m.If(bus.ack):
1656                     sync += r1.state.eq(State.IDLE)
1657                     sync += r1.full.eq(0)
1658                     sync += r1.slow_valid.eq(1)
1659
1660                     with m.If(r1.mmu_req):
1661                         sync += r1.mmu_done.eq(1)
1662                     with m.Else():
1663                         sync += r1.ls_valid.eq(1)
1664
1665                     sync += r1.forward_sel.eq(~0) # all 1s
1666                     sync += r1.use_forward1.eq(1)
1667                     sync += r1.wb.cyc.eq(0)
1668                     sync += r1.wb.stb.eq(0)
1669
1670     def dcache_log(self, m, r1, valid_ra, tlb_hit, stall_out):
1671
1672         sync = m.d.sync
1673         d_out, bus, log_out = self.d_out, self.bus, self.log_out
1674
1675         sync += log_out.eq(Cat(r1.state[:3], valid_ra, tlb_hit.way[:3],
1676                                stall_out, req_op[:3], d_out.valid, d_out.error,
1677                                r1.wb.cyc, r1.wb.stb, bus.ack, bus.stall,
1678                                r1.real_adr[3:6]))
1679
    def elaborate(self, platform):
        """Assemble the data cache as an nmigen Module.

        Declares the tag storage, the r0 / r1 stage objects, the
        reservation object and every signal shared between the
        sub-functions; drives the stall / valid handshake for r0; wires
        the stage-1 wishbone latch (r1.wb) onto self.bus; then calls
        each sub-function in turn, handing it the shared objects.

        :param platform: not referenced anywhere in this method
        :returns: the populated Module
        """

        m = Module()
        comb = m.d.comb
        d_in = self.d_in

        # Storage. Hopefully "cache_rows" is a BRAM, the rest is LUTs
        cache_tags       = CacheTagArray()
        cache_tag_set    = Signal(TAG_RAM_WIDTH)

        # TODO attribute ram_style : string;
        # TODO attribute ram_style of cache_tags : signal is "distributed";

        """note: these are passed to nmigen.hdl.Memory as "attributes".
           don't know how, just that they are.
        """
        # TODO attribute ram_style of
        #  dtlb_tags : signal is "distributed";
        # TODO attribute ram_style of
        #  dtlb_ptes : signal is "distributed";

        # stage objects: r0_full is the separate "r0 holds a request" flag
        r0      = RegStage0("r0")
        r0_full = Signal()

        r1 = RegStage1("r1")

        reservation = Reservation()

        # Async signals on incoming request
        req_index    = Signal(INDEX_BITS)
        req_row      = Signal(ROW_BITS)
        req_hit_way  = Signal(WAY_BITS)
        req_tag      = Signal(TAG_BITS)
        req_op       = Signal(Op)
        # NOTE(review): req_data is not passed to any sub-function below
        req_data     = Signal(64)
        req_same_tag = Signal()
        req_go       = Signal()

        early_req_row     = Signal(ROW_BITS)

        cancel_store      = Signal()
        set_rsrv          = Signal()
        clear_rsrv        = Signal()

        r0_valid          = Signal()
        r0_stall          = Signal()

        use_forward1_next = Signal()
        use_forward2_next = Signal()

        cache_out_row     = Signal(WB_DATA_BITS)

        plru_victim       = Signal(WAY_BITS)
        replace_way       = Signal(WAY_BITS)

        # Wishbone read/write/cache write formatting signals
        # NOTE(review): bus_sel is not passed to any sub-function below
        bus_sel           = Signal(8)

        # TLB signals
        tlb_way       = TLBRecord("tlb_way")
        tlb_req_index = Signal(TLB_SET_BITS)
        tlb_hit       = TLBHit("tlb_hit")
        pte           = Signal(TLB_PTE_BITS)
        ra            = Signal(REAL_ADDR_BITS)
        valid_ra      = Signal()
        perm_attr     = PermAttr("dc_perms")
        rc_ok         = Signal()
        perm_ok       = Signal()
        access_ok     = Signal()

        tlb_plru_victim = Signal(TLB_WAY_BITS)

        # we don't yet handle collisions between loadstore1 requests
        # and MMU requests
        comb += self.m_out.stall.eq(0)

        # Hold off the request in r0 when r1 has an uncompleted request
        # r0_stall: r0 is full and either r1 is full or d_in.hold is set
        # r0_valid: r0 is full and neither r1.full nor d_in.hold blocks it
        comb += r0_stall.eq(r0_full & (r1.full | d_in.hold))
        comb += r0_valid.eq(r0_full & ~r1.full & ~d_in.hold)
        comb += self.stall_out.eq(r0_stall)

        # deal with litex not doing wishbone pipeline mode
        # XXX in wrong way.  FIFOs are needed in the SRAM test
        # so that stb/ack match up. same thing done in icache.py
        # (stall is only driven here when microwatt_compat is false:
        # it is asserted while cyc is high and ack is low)
        if not self.microwatt_compat:
            comb += self.bus.stall.eq(self.bus.cyc & ~self.bus.ack)

        # Wire up wishbone request latch out of stage 1
        comb += self.bus.we.eq(r1.wb.we)
        comb += self.bus.adr.eq(r1.wb.adr)
        comb += self.bus.sel.eq(r1.wb.sel)
        comb += self.bus.stb.eq(r1.wb.stb)
        comb += self.bus.dat_w.eq(r1.wb.dat)
        comb += self.bus.cyc.eq(r1.wb.cyc)

        # create submodule TLBUpdate
        # (the same instance is also kept on self.dtlb_update)
        m.submodules.dtlb_update = self.dtlb_update = DTLBUpdate()

        # call sub-functions putting everything together, using shared
        # signals established above
        self.stage_0(m, r0, r1, r0_full)
        self.tlb_read(m, r0_stall, tlb_way)
        self.tlb_search(m, tlb_req_index, r0, r0_valid,
                        tlb_way,
                        pte, tlb_hit, valid_ra, perm_attr, ra)
        self.tlb_update(m, r0_valid, r0, tlb_req_index,
                        tlb_hit, tlb_plru_victim)
        self.maybe_plrus(m, r1, plru_victim)
        self.maybe_tlb_plrus(m, r1, tlb_plru_victim, tlb_req_index)
        self.cache_tag_read(m, r0_stall, req_index, cache_tag_set, cache_tags)
        self.dcache_request(m, r0, ra, req_index, req_row, req_tag,
                           r0_valid, r1, cache_tags, replace_way,
                           use_forward1_next, use_forward2_next,
                           req_hit_way, plru_victim, rc_ok, perm_attr,
                           valid_ra, perm_ok, access_ok, req_op, req_go,
                           tlb_hit, tlb_way, cache_tag_set,
                           cancel_store, req_same_tag, r0_stall, early_req_row)
        self.reservation_comb(m, cancel_store, set_rsrv, clear_rsrv,
                           r0_valid, r0, reservation)
        self.reservation_reg(m, r0_valid, access_ok, set_rsrv, clear_rsrv,
                           reservation, r0)
        self.writeback_control(m, r1, cache_out_row)
        self.rams(m, r1, early_req_row, cache_out_row, replace_way)
        self.dcache_fast_hit(m, req_op, r0_valid, r0, r1,
                        req_hit_way, req_index, req_tag, access_ok,
                        tlb_hit, tlb_req_index)
        self.dcache_slow(m, r1, use_forward1_next, use_forward2_next,
                    r0, replace_way,
                    req_hit_way, req_same_tag,
                         r0_valid, req_op, cache_tags, req_go, ra)
        # logging is disabled.  NOTE(review): `stall_out` is not a local
        # name in this method (the output is self.stall_out), so the call
        # below cannot be re-enabled exactly as written.
        #self.dcache_log(m, r1, valid_ra, tlb_hit, stall_out)

        return m
1813
1814
if __name__ == '__main__':
    # Script entry point: elaborate a default DCache, convert it to
    # RTLIL text (no explicit ports) and save it for inspection.
    rtlil_text = rtlil.convert(DCache(), ports=[])
    with open("test_dcache.il", "w") as il_file:
        il_file.write(rtlil_text)