src/soc/experiment/dcache.py

   1 """DCache
   2
   3 based on Anton Blanchard microwatt dcache.vhdl
   4
   5 note that the microwatt dcache wishbone interface expects "stall".
   6 for simplicity at the moment this is hard-coded to cyc & ~ack.
   7 see WB4 spec, p84, section 5.2.1
   8
   9 IMPORTANT: for store, the data is sampled the cycle AFTER the "valid"
  10 is raised.  sigh
  11
  12 Links:
  13
  14 * https://libre-soc.org/3d_gpu/architecture/set_associative_cache.jpg
  15 * https://bugs.libre-soc.org/show_bug.cgi?id=469
  16 * https://libre-soc.org/irclog-microwatt/%23microwatt.2021-12-07.log.html
  17   (discussion about brams for ECP5)
  18
  19 """
  20
  21 import sys
  22
  23 from nmutil.gtkw import write_gtkw
  24
  25 sys.setrecursionlimit(1000000)
  26
  27 from enum import Enum, unique
  28
  29 from nmigen import (Module, Signal, Elaboratable, Cat, Repl, Array, Const,
  30                     Record, Memory)
  31 from nmutil.util import Display
  32 from nmigen.lib.coding import Decoder
  33
  34 from copy import deepcopy
  35 from random import randint, seed
  36
  37 from nmigen_soc.wishbone.bus import Interface
  38
  39 from nmigen.cli import main
  40 from nmutil.iocontrol import RecordObject
  41 from nmigen.utils import log2_int
  42 from soc.experiment.mem_types import (LoadStore1ToDCacheType,
  43                                      DCacheToLoadStore1Type,
  44                                      MMUToDCacheType,
  45                                      DCacheToMMUType)
  46
  47 from soc.experiment.wb_types import (WB_ADDR_BITS, WB_DATA_BITS, WB_SEL_BITS,
  48                                 WBAddrType, WBDataType, WBSelType,
  49                                 WBMasterOut, WBSlaveOut,
  50                                 WBMasterOutVector, WBSlaveOutVector,
  51                                 WBIOMasterOut, WBIOSlaveOut)
  52
  53 from soc.experiment.cache_ram import CacheRam
  54 from soc.experiment.plru import PLRU, PLRUs
  55 #from nmutil.plru import PLRU, PLRUs
  56
  57 # for test
  58 from soc.bus.sram import SRAM
  59 from nmigen import Memory
  60 from nmigen.cli import rtlil
  61
  62 # NOTE: to use cxxsim, export NMIGEN_SIM_MODE=cxxsim from the shell
  63 # Also, check out the cxxsim nmigen branch, and latest yosys from git
  64 from nmutil.sim_tmp_alternative import Simulator
  65
  66 from nmutil.util import wrap
  67
  68
# Cache and TLB geometry.  Everything below is derived from these values.
# TODO: make these parameters of DCache at some point
LINE_SIZE = 64    # Line size in bytes
NUM_LINES = 32    # Number of lines in a set
NUM_WAYS = 4      # Number of ways
TLB_SET_SIZE = 64 # L1 DTLB depth: rows in the TLB tag/PTE memories
TLB_NUM_WAYS = 2  # L1 DTLB ways stored side by side in each row
TLB_LG_PGSZ = 12  # L1 DTLB log_2(page_size)
LOG_LENGTH = 0    # Non-zero to enable log data collection

# BRAM organisation: we never access more than WB_DATA_BITS at a time,
# so to save resources the array is made only that wide and consecutive
# indices are used to make up a cache "line".
#
# ROW_SIZE is the width in bytes of the BRAM
# (based on WB, so 64-bits)
ROW_SIZE = WB_DATA_BITS // 8;

# ROW_PER_LINE is the number of rows (wishbone
# transactions) in a line
ROW_PER_LINE = LINE_SIZE // ROW_SIZE

# BRAM_ROWS is the number of rows in BRAM needed
# to represent the full dcache
BRAM_ROWS = NUM_LINES * ROW_PER_LINE

# diagnostic dump of the derived geometry (runs at import time)
print ("ROW_SIZE", ROW_SIZE)
print ("ROW_PER_LINE", ROW_PER_LINE)
print ("BRAM_ROWS", BRAM_ROWS)
print ("NUM_WAYS", NUM_WAYS)
  99
# Bit field counts in the address

# REAL_ADDR_BITS is the number of real address
# bits that we store
REAL_ADDR_BITS = 56

# ROW_BITS is the number of bits to select a row
ROW_BITS = log2_int(BRAM_ROWS)

# ROW_LINE_BITS is the number of bits to select
# a row within a line
ROW_LINE_BITS = log2_int(ROW_PER_LINE)

# LINE_OFF_BITS is the number of bits for
# the offset in a cache line
LINE_OFF_BITS = log2_int(LINE_SIZE)

# ROW_OFF_BITS is the number of bits for
# the offset in a row
ROW_OFF_BITS = log2_int(ROW_SIZE)

# INDEX_BITS is the number of bits to
# select a cache line
INDEX_BITS = log2_int(NUM_LINES)

# SET_SIZE_BITS is the log base 2 of the set size
SET_SIZE_BITS = LINE_OFF_BITS + INDEX_BITS

# TAG_BITS is the number of bits of
# the tag part of the address
TAG_BITS = REAL_ADDR_BITS - SET_SIZE_BITS

# TAG_WIDTH is the width in bits of each way of the tag RAM:
# TAG_BITS rounded up to the next multiple of 8
TAG_WIDTH = TAG_BITS + 7 - ((TAG_BITS + 7) % 8)

# WAY_BITS is the number of bits to select a way
WAY_BITS = log2_int(NUM_WAYS)

# Example of layout for 32 lines of 64 bytes:
# (an f-string, so the banner shows the values actually configured)
layout = f"""\
  DCache Layout:
 |.. -----------------------| REAL_ADDR_BITS ({REAL_ADDR_BITS})
  ..         |--------------| SET_SIZE_BITS ({SET_SIZE_BITS})
  ..  tag    |index|  line  |
  ..         |   row   |    |
  ..         |     |---|    | ROW_LINE_BITS ({ROW_LINE_BITS})
  ..         |     |--- - --| LINE_OFF_BITS ({LINE_OFF_BITS})
  ..         |         |- --| ROW_OFF_BITS  ({ROW_OFF_BITS})
  ..         |----- ---|    | ROW_BITS      ({ROW_BITS})
  ..         |-----|        | INDEX_BITS    ({INDEX_BITS})
  .. --------|              | TAG_BITS      ({TAG_BITS})
"""
print (layout)
print ("Dcache TAG %d IDX %d ROW_BITS %d ROFF %d LOFF %d RLB %d" % \
            (TAG_BITS, INDEX_BITS, ROW_BITS,
             ROW_OFF_BITS, LINE_OFF_BITS, ROW_LINE_BITS))
# bit ranges of each field within the address, printed as "low-high"
print ("index @: %d-%d" % (LINE_OFF_BITS, SET_SIZE_BITS))
# NOTE(review): get_row() slices addr[ROW_OFF_BITS:SET_SIZE_BITS], but the
# pair printed here is (LINE_OFF_BITS, ROW_OFF_BITS) - confirm which range
# this diagnostic is meant to show.
print ("row @: %d-%d" % (LINE_OFF_BITS, ROW_OFF_BITS))
print ("tag @: %d-%d width %d" % (SET_SIZE_BITS, REAL_ADDR_BITS, TAG_WIDTH))

# one tag-RAM row holds the (byte-padded) tags of all ways of a line
TAG_RAM_WIDTH = TAG_WIDTH * NUM_WAYS

print ("TAG_RAM_WIDTH", TAG_RAM_WIDTH)
print ("    TAG_WIDTH", TAG_WIDTH)
print ("     NUM_WAYS", NUM_WAYS)
print ("    NUM_LINES", NUM_LINES)
 166
 167
 168 def CacheTag(name=None):
 169     tag_layout = [('valid', NUM_WAYS),
 170                   ('tag', TAG_RAM_WIDTH),
 171                  ]
 172     return Record(tag_layout, name=name)
 173
 174
 175 def CacheTagArray():
 176     return Array(CacheTag(name="tag%d" % x) for x in range(NUM_LINES))
 177
 178
 179 def RowPerLineValidArray():
 180     return Array(Signal(name="rows_valid%d" % x) \
 181                         for x in range(ROW_PER_LINE))
 182
 183
 184 # L1 TLB
 185 TLB_SET_BITS     = log2_int(TLB_SET_SIZE)
 186 TLB_WAY_BITS     = log2_int(TLB_NUM_WAYS)
 187 TLB_EA_TAG_BITS  = 64 - (TLB_LG_PGSZ + TLB_SET_BITS)
 188 TLB_TAG_WAY_BITS = TLB_NUM_WAYS * TLB_EA_TAG_BITS
 189 TLB_PTE_BITS     = 64
 190 TLB_PTE_WAY_BITS = TLB_NUM_WAYS * TLB_PTE_BITS;
 191
 192 def ispow2(x):
 193     return (1<<log2_int(x, False)) == x
 194
# Import-time sanity checks on the cache geometry defined above.

# a line must be made of a whole number of BRAM rows
assert (LINE_SIZE % ROW_SIZE) == 0, "LINE_SIZE not multiple of ROW_SIZE"
assert ispow2(LINE_SIZE), "LINE_SIZE not power of 2"
assert ispow2(NUM_LINES), "NUM_LINES not power of 2"
assert ispow2(ROW_PER_LINE), "ROW_PER_LINE not power of 2"
# the address fields must tile exactly, whichever way they are split up
assert ROW_BITS == (INDEX_BITS + ROW_LINE_BITS), "geometry bits don't add up"
assert (LINE_OFF_BITS == ROW_OFF_BITS + ROW_LINE_BITS), \
        "geometry bits don't add up"
assert REAL_ADDR_BITS == (TAG_BITS + INDEX_BITS + LINE_OFF_BITS), \
        "geometry bits don't add up"
assert REAL_ADDR_BITS == (TAG_BITS + ROW_BITS + ROW_OFF_BITS), \
         "geometry bits don't add up"
assert 64 == WB_DATA_BITS, "Can't yet handle wb width that isn't 64-bits"
# index + line offset must fit inside the page offset (TLB_LG_PGSZ bits),
# i.e. the bits used to index the cache are not changed by translation
assert SET_SIZE_BITS <= TLB_LG_PGSZ, "Set indexed by virtual address"
 208
 209
 210 def TLBHit(name):
 211     return Record([('valid', 1),
 212                    ('way', TLB_WAY_BITS)], name=name)
 213
 214 def TLBTagEAArray():
 215     return Array(Signal(TLB_EA_TAG_BITS, name="tlbtagea%d" % x) \
 216                 for x in range (TLB_NUM_WAYS))
 217
 218 def TLBRecord(name):
 219     tlb_layout = [('valid', TLB_NUM_WAYS),
 220                   ('tag', TLB_TAG_WAY_BITS),
 221                   ('pte', TLB_PTE_WAY_BITS)
 222                  ]
 223     return Record(tlb_layout, name=name)
 224
 225 def TLBValidArray():
 226     return Array(Signal(TLB_NUM_WAYS, name="tlb_valid%d" % x)
 227                         for x in range(TLB_SET_SIZE))
 228
 229 def HitWaySet():
 230     return Array(Signal(WAY_BITS, name="hitway_%d" % x) \
 231                         for x in range(TLB_NUM_WAYS))
 232
 233 # Cache RAM interface
 234 def CacheRamOut():
 235     return Array(Signal(WB_DATA_BITS, name="cache_out%d" % x) \
 236                  for x in range(NUM_WAYS))
 237
 238 # PLRU output interface
 239 def PLRUOut():
 240     return Array(Signal(WAY_BITS, name="plru_out%d" % x) \
 241                 for x in range(NUM_LINES))
 242
 243 # TLB PLRU output interface
 244 def TLBPLRUOut():
 245     return Array(Signal(TLB_WAY_BITS, name="tlbplru_out%d" % x) \
 246                 for x in range(TLB_SET_SIZE))
 247
 248 # Helper functions to decode incoming requests
 249 #
 250 # Return the cache line index (tag index) for an address
 251 def get_index(addr):
 252     return addr[LINE_OFF_BITS:SET_SIZE_BITS]
 253
 254 # Return the cache row index (data memory) for an address
 255 def get_row(addr):
 256     return addr[ROW_OFF_BITS:SET_SIZE_BITS]
 257
 258 # Return the index of a row within a line
 259 def get_row_of_line(row):
 260     return row[:ROW_BITS][:ROW_LINE_BITS]
 261
 262 # Returns whether this is the last row of a line
 263 def is_last_row_addr(addr, last):
 264     return addr[ROW_OFF_BITS:LINE_OFF_BITS] == last
 265
 266 # Returns whether this is the last row of a line
 267 def is_last_row(row, last):
 268     return get_row_of_line(row) == last
 269
 270 # Return the next row in the current cache line. We use a
 271 # dedicated function in order to limit the size of the
 272 # generated adder to be only the bits within a cache line
 273 # (3 bits with default settings)
 274 def next_row(row):
 275     row_v = row[0:ROW_LINE_BITS] + 1
 276     return Cat(row_v[:ROW_LINE_BITS], row[ROW_LINE_BITS:])
 277
 278 # Get the tag value from the address
 279 def get_tag(addr):
 280     return addr[SET_SIZE_BITS:REAL_ADDR_BITS]
 281
 282 # Read a tag from a tag memory row
 283 def read_tag(way, tagset):
 284     return tagset.word_select(way, TAG_WIDTH)[:TAG_BITS]
 285
 286 # Read a TLB tag from a TLB tag memory row
 287 def read_tlb_tag(way, tags):
 288     return tags.word_select(way, TLB_EA_TAG_BITS)
 289
 290 # Write a TLB tag to a TLB tag memory row
 291 def write_tlb_tag(way, tags, tag):
 292     return read_tlb_tag(way, tags).eq(tag)
 293
 294 # Read a PTE from a TLB PTE memory row
 295 def read_tlb_pte(way, ptes):
 296     return ptes.word_select(way, TLB_PTE_BITS)
 297
 298 def write_tlb_pte(way, ptes, newpte):
 299     return read_tlb_pte(way, ptes).eq(newpte)
 300
 301
 302 # Record for storing permission, attribute, etc. bits from a PTE
class PermAttr(RecordObject):
    """Permission and attribute bits taken from a PTE, one 1-bit field each.

    The field names follow the PTE flag they hold; the exact PTE bit
    positions are not defined here (presumably assigned where a TLB hit
    is decoded - verify against the user of this record).
    """
    def __init__(self, name=None):
        super().__init__(name=name)
        # each field is a single-bit Signal
        self.reference = Signal()
        self.changed   = Signal()
        self.nocache   = Signal()
        self.priv      = Signal()
        self.rd_perm   = Signal()
        self.wr_perm   = Signal()
 312
 313
 314 def extract_perm_attr(pte):
 315     pa = PermAttr()
 316     return pa;
 317
 318
 319 # Type of operation on a "valid" input
@unique
class Op(Enum):
    """Type of operation on a "valid" input.

    Used as the shape of MemAccessRequest.op; @unique guarantees that
    no two members share a value.
    """
    OP_NONE       = 0
    OP_BAD        = 1 # NC cache hit, TLB miss, prot/RC failure
    OP_STCX_FAIL  = 2 # conditional store w/o reservation
    OP_LOAD_HIT   = 3 # Cache hit on load
    OP_LOAD_MISS  = 4 # Load missing cache
    OP_LOAD_NC    = 5 # Non-cachable load
    OP_STORE_HIT  = 6 # Store hitting cache
    OP_STORE_MISS = 7 # Store missing cache
 330
 331
 332 # Cache state machine
@unique
class State(Enum):
    """Cache state machine states.

    Used as the shape of RegStage1.state; @unique guarantees that no
    two members share a value.
    """
    IDLE             = 0 # Normal load hit processing
    RELOAD_WAIT_ACK  = 1 # Cache reload wait ack
    STORE_WAIT_ACK   = 2 # Store wait ack
    NC_LOAD_WAIT_ACK = 3 # Non-cachable load wait ack
 339
 340
 341 # Dcache operations:
 342 #
 343 # In order to make timing, we use the BRAMs with
 344 # an output buffer, which means that the BRAM
 345 # output is delayed by an extra cycle.
 346 #
 347 # Thus, the dcache has a 2-stage internal pipeline
 348 # for cache hits with no stalls.
 349 #
 350 # All other operations are handled via stalling
 351 # in the first stage.
 352 #
 353 # The second stage can thus complete a hit at the same
 354 # time as the first stage emits a stall for a complex op.
 355 #
 356 # Stage 0 register, basically contains just the latched request
 357
class RegStage0(RecordObject):
    """Stage 0 register: the latched request plus its MMU/TLB flags."""
    def __init__(self, name=None):
        super().__init__(name=name)
        # the request itself, in the loadstore1 -> dcache format
        self.req     = LoadStore1ToDCacheType(name="lsmem")
        self.tlbie   = Signal() # indicates a tlbie request (from MMU)
        self.doall   = Signal() # with tlbie, indicates flush whole TLB
        self.tlbld   = Signal() # indicates a TLB load request (from MMU)
        self.mmu_req = Signal() # indicates source of request
        self.d_valid = Signal() # indicates req.data is valid now
 367
 368
class MemAccessRequest(RecordObject):
    """A decoded memory access request, as stored in RegStage1.req."""
    def __init__(self, name=None):
        super().__init__(name=name)
        self.op        = Signal(Op)             # operation type (see Op)
        self.valid     = Signal()
        self.dcbz      = Signal()
        self.real_addr = Signal(REAL_ADDR_BITS) # real address
        self.data      = Signal(64)
        self.byte_sel  = Signal(8)              # one bit per data byte
        self.hit_way   = Signal(WAY_BITS)       # a cache way number
        self.same_tag  = Signal()
        self.mmu_req   = Signal()
 381
 382
 383 # First stage register, contains state for stage 1 of load hits
 384 # and for the state machine used by all other operations
class RegStage1(RecordObject):
    """First stage register.

    Contains the state for stage 1 of load hits and for the state
    machine used by all other operations.  Fields are grouped below by
    purpose; widths come from the module-level geometry constants.
    """
    def __init__(self, name=None):
        super().__init__(name=name)
        # Info about the request
        self.full             = Signal() # have uncompleted request
        self.mmu_req          = Signal() # request is from MMU
        self.req              = MemAccessRequest(name="reqmem")

        # Cache hit state
        self.hit_way          = Signal(WAY_BITS)
        self.hit_load_valid   = Signal()
        self.hit_index        = Signal(INDEX_BITS)
        self.cache_hit        = Signal()

        # TLB hit state
        self.tlb_hit          = TLBHit("tlb_hit")
        self.tlb_hit_index    = Signal(TLB_SET_BITS)

        # 2-stage data buffer for data forwarded from writes to reads
        self.forward_data1    = Signal(64)
        self.forward_data2    = Signal(64)
        self.forward_sel1     = Signal(8)
        self.forward_valid1   = Signal()
        self.forward_way1     = Signal(WAY_BITS)
        self.forward_row1     = Signal(ROW_BITS)
        self.use_forward1     = Signal()
        self.forward_sel      = Signal(8)

        # Cache miss state (reload state machine)
        self.state            = Signal(State)
        self.dcbz             = Signal()
        self.write_bram       = Signal()
        self.write_tag        = Signal()
        self.slow_valid       = Signal()
        self.wb               = WBMasterOut("wb")
        self.reload_tag       = Signal(TAG_BITS)
        self.store_way        = Signal(WAY_BITS)
        self.store_row        = Signal(ROW_BITS)
        self.store_index      = Signal(INDEX_BITS)
        self.end_row_ix       = Signal(ROW_LINE_BITS)
        # one flag per row of a line
        self.rows_valid       = RowPerLineValidArray()
        # 3-bit counter with its increment/decrement requests
        self.acks_pending     = Signal(3)
        self.inc_acks         = Signal()
        self.dec_acks         = Signal()

        # Signals to complete (possibly with error)
        self.ls_valid         = Signal()
        self.ls_error         = Signal()
        self.mmu_done         = Signal()
        self.mmu_error        = Signal()
        self.cache_paradox    = Signal()

        # Signal to complete a failed stcx.
        self.stcx_fail        = Signal()
 439
 440
 441 # Reservation information
class Reservation(RecordObject):
    """Reservation information: a valid flag and a line-granular address."""
    def __init__(self, name=None):
        super().__init__(name=name)
        self.valid = Signal()
        # a 64-bit address with the LINE_OFF_BITS in-line offset bits
        # dropped, so the address identifies a whole cache line
        self.addr  = Signal(64-LINE_OFF_BITS)
 447
 448
class DTLBUpdate(Elaboratable):
    """Storage and update logic for the L1 DTLB.

    Holds the TLB tags and PTEs in two Memories (TLB_SET_SIZE rows, all
    ways side by side in each row) and the per-way valid bits in
    registers.  Handles the three update operations (invalidate all,
    invalidate one entry, write an entry) and provides a read of one
    whole TLB row on the tlb_way output.
    """
    def __init__(self):
        # update requests
        self.tlbie    = Signal()
        self.tlbwe    = Signal()
        self.doall    = Signal()
        self.tlb_hit     = TLBHit("tlb_hit")
        # row that the update (write/invalidate) applies to
        self.tlb_req_index = Signal(TLB_SET_BITS)

        # way to replace and the new entry contents, used with tlbwe
        self.repl_way        = Signal(TLB_WAY_BITS)
        self.eatag           = Signal(TLB_EA_TAG_BITS)
        self.pte_data        = Signal(TLB_PTE_BITS)

        # read from dtlb array
        self.tlb_read       = Signal()
        self.tlb_read_index = Signal(TLB_SET_BITS)
        self.tlb_way        = TLBRecord("o_tlb_way")

    def elaborate(self, platform):
        """Build the TLB memories, valid registers and update/read logic."""
        m = Module()
        comb = m.d.comb
        sync = m.d.sync

        # there are 3 parts to this:
        # QTY TLB_NUM_WAYs TAGs - of width (say) 46 bits of Effective Address
        # QTY TLB_NUM_WAYs PTEs - of width (say) 64 bits
        # "Valid" bits, one per "way", of QTY TLB_NUM_WAYs.  these cannot
        # be a Memory because they can all be cleared (tlbie, doall), i mean,
        # we _could_, in theory, by overriding the Reset Signal of the Memory,
        # hmmm....

        dtlb_valid = TLBValidArray()
        tlb_req_index = self.tlb_req_index

        print ("TLB_TAG_WAY_BITS", TLB_TAG_WAY_BITS)
        print ("     TLB_EA_TAG_BITS", TLB_EA_TAG_BITS)
        print ("        TLB_NUM_WAYS", TLB_NUM_WAYS)
        print ("TLB_PTE_WAY_BITS", TLB_PTE_WAY_BITS)
        print ("    TLB_PTE_BITS", TLB_PTE_BITS)
        print ("    TLB_NUM_WAYS", TLB_NUM_WAYS)

        # TAG and PTE Memory SRAMs. transparent, write-enables are TLB_NUM_WAYS
        # (granularity = one way, so each way has its own write-enable bit)
        tagway = Memory(depth=TLB_SET_SIZE, width=TLB_TAG_WAY_BITS)
        m.submodules.rd_tagway = rd_tagway = tagway.read_port()
        m.submodules.wr_tagway = wr_tagway = tagway.write_port(
                                    granularity=TLB_EA_TAG_BITS)

        pteway = Memory(depth=TLB_SET_SIZE, width=TLB_PTE_WAY_BITS)
        m.submodules.rd_pteway = rd_pteway = pteway.read_port()
        m.submodules.wr_pteway = wr_pteway = pteway.write_port(
                                    granularity=TLB_PTE_BITS)

        # commented out for now, can be put in if Memory.reset can be
        # used for tlbie&doall to reset the entire Memory to zero in 1 cycle
        #validm = Memory(depth=TLB_SET_SIZE, width=TLB_NUM_WAYS)
        #m.submodules.rd_valid = rd_valid = validm.read_port()
        #m.submodules.wr_valid = wr_valid = validm.write_port(
                                    #granularity=1)

        # connect up read and write addresses to Valid/PTE/TAG SRAMs
        # (reads use tlb_read_index, writes use tlb_req_index)
        m.d.comb += rd_pteway.addr.eq(self.tlb_read_index)
        m.d.comb += rd_tagway.addr.eq(self.tlb_read_index)
        #m.d.comb += rd_valid.addr.eq(self.tlb_read_index)
        m.d.comb += wr_tagway.addr.eq(tlb_req_index)
        m.d.comb += wr_pteway.addr.eq(tlb_req_index)
        #m.d.comb += wr_valid.addr.eq(tlb_req_index)

        # updated:   the tag and PTE memories are to be written
        # v_updated: the valid bits of the addressed row are to be written
        updated  = Signal()
        v_updated  = Signal()
        tb_out = Signal(TLB_TAG_WAY_BITS) # tlb_way_tags_t
        db_out = Signal(TLB_NUM_WAYS)     # tlb_way_valids_t
        pb_out = Signal(TLB_PTE_WAY_BITS) # tlb_way_ptes_t
        dv = Signal(TLB_NUM_WAYS) # tlb_way_valids_t

        # db_out starts as the current valid bits of the addressed row;
        # the branches below change individual bits of it
        comb += dv.eq(dtlb_valid[tlb_req_index])
        comb += db_out.eq(dv)

        with m.If(self.tlbie & self.doall):
            # clear all valid bits at once
            # XXX hmmm, validm _could_ use Memory reset here...
            for i in range(TLB_SET_SIZE):
                sync += dtlb_valid[i].eq(0)
        with m.Elif(self.tlbie):
            # invalidate just the hit_way
            # (only the valid bit changes: tag and PTE are not rewritten)
            with m.If(self.tlb_hit.valid):
                comb += db_out.bit_select(self.tlb_hit.way, 1).eq(0)
                comb += v_updated.eq(1)
        with m.Elif(self.tlbwe):
            # write to the requested tag and PTE
            comb += write_tlb_tag(self.repl_way, tb_out, self.eatag)
            comb += write_tlb_pte(self.repl_way, pb_out, self.pte_data)
            # set valid bit
            comb += db_out.bit_select(self.repl_way, 1).eq(1)

            comb += updated.eq(1)
            comb += v_updated.eq(1)

        # above, sometimes valid is requested to be updated but data not
        # therefore split them out, here.  note the granularity thing matches
        # with the shift-up of the eatag/pte_data into the correct TLB way.
        # thus is it not necessary to write the entire lot, just the portion
        # being altered: hence writing the *old* copy of the row is not needed
        with m.If(updated): # PTE and TAG to be written
            # one-hot enable: only the slot of repl_way is written
            comb += wr_pteway.data.eq(pb_out)
            comb += wr_pteway.en.eq(1<<self.repl_way)
            comb += wr_tagway.data.eq(tb_out)
            comb += wr_tagway.en.eq(1<<self.repl_way)
        with m.If(v_updated): # Valid to be written
            sync += dtlb_valid[tlb_req_index].eq(db_out)
            #comb += wr_valid.data.eq(db_out)
            #comb += wr_valid.en.eq(1<<self.repl_way)

        # select one TLB way, use a register here
        r_tlb_way        = TLBRecord("r_tlb_way")
        # r_delay is tlb_read delayed by one clock
        r_delay = Signal()
        sync += r_delay.eq(self.tlb_read)
        with m.If(self.tlb_read):
            # valid bits live in registers, so they are captured directly
            sync += self.tlb_way.valid.eq(dtlb_valid[self.tlb_read_index])
        with m.If(r_delay):
            # on one clock delay, output the contents of the read port(s)
            # comb += self.tlb_way.valid.eq(rd_valid.data)
            comb += self.tlb_way.tag.eq(rd_tagway.data)
            comb += self.tlb_way.pte.eq(rd_pteway.data)
            # and also capture the (delayed) output...
            #sync += r_tlb_way.valid.eq(rd_valid.data)
            sync += r_tlb_way.tag.eq(rd_tagway.data)
            sync += r_tlb_way.pte.eq(rd_pteway.data)
        with m.Else():
            # ... so that the register can output it when no read is requested
            # it's rather overkill but better to be safe than sorry
            comb += self.tlb_way.tag.eq(r_tlb_way.tag)
            comb += self.tlb_way.pte.eq(r_tlb_way.pte)
            #comb += self.tlb_way.eq(r_tlb_way)

        return m
 583
 584
class DCachePendingHit(Elaboratable):
    """Combinatorial test of whether the pending request hits in the cache.

    Inputs passed to the constructor:
    * tlb_way:         one TLB row (its 'valid' and 'pte' fields are used)
    * cache_i_validdx: per-way valid bits of the indexed cache line
    * cache_tag_set:   tag memory row of the indexed cache line
    * req_addr:        address of the request

    Outputs: is_hit, hit_way and rel_match (the request tag equals
    reload_tag).
    """

    def __init__(self, tlb_way,
                      cache_i_validdx, cache_tag_set,
                    req_addr):

        self.go          = Signal()
        self.virt_mode   = Signal()
        self.is_hit      = Signal()
        self.tlb_hit      = TLBHit("tlb_hit")
        self.hit_way     = Signal(WAY_BITS)
        self.rel_match   = Signal()
        self.req_index   = Signal(INDEX_BITS)
        self.reload_tag  = Signal(TAG_BITS)

        # signals owned by the caller, only read here
        self.tlb_way = tlb_way
        self.cache_i_validdx = cache_i_validdx
        self.cache_tag_set = cache_tag_set
        self.req_addr = req_addr

    def elaborate(self, platform):
        """Build the tag comparators and the hit/way selection logic."""
        m = Module()
        comb = m.d.comb
        sync = m.d.sync

        go = self.go
        virt_mode = self.virt_mode
        is_hit = self.is_hit
        tlb_way = self.tlb_way
        cache_i_validdx = self.cache_i_validdx
        cache_tag_set = self.cache_tag_set
        req_addr = self.req_addr
        tlb_hit = self.tlb_hit
        hit_way = self.hit_way
        rel_match = self.rel_match
        req_index = self.req_index
        reload_tag = self.reload_tag

        # per-TLB-way intermediate results, used in virtual mode
        hit_set     = Array(Signal(name="hit_set_%d" % i) \
                                  for i in range(TLB_NUM_WAYS))
        rel_matches = Array(Signal(name="rel_matches_%d" % i) \
                                    for i in range(TLB_NUM_WAYS))
        hit_way_set = HitWaySet()

        # Test if pending request is a hit on any way
        # In order to make timing in virtual mode,
        # when we are using the TLB, we compare each
        # way with each of the real addresses from each way of
        # the TLB, and then decide later which match to use.

        with m.If(virt_mode):
            for j in range(TLB_NUM_WAYS): # tlb_num_way_t
                s_tag       = Signal(TAG_BITS, name="s_tag%d" % j)
                s_hit       = Signal(name="s_hit%d" % j)
                s_pte       = Signal(TLB_PTE_BITS, name="s_pte%d" % j)
                s_ra        = Signal(REAL_ADDR_BITS, name="s_ra%d" % j)
                # read the PTE, calc the Real Address, get the tag
                # (page offset from the request, upper bits from the PTE)
                comb += s_pte.eq(read_tlb_pte(j, tlb_way.pte))
                comb += s_ra.eq(Cat(req_addr[0:TLB_LG_PGSZ],
                                    s_pte[TLB_LG_PGSZ:REAL_ADDR_BITS]))
                comb += s_tag.eq(get_tag(s_ra))
                # for each way check the tag against the cache tag set
                for i in range(NUM_WAYS): # way_t
                    is_tag_hit = Signal(name="is_tag_hit_%d_%d" % (j, i))
                    comb += is_tag_hit.eq(go & cache_i_validdx[i] &
                                  (read_tag(i, cache_tag_set) == s_tag)
                                  & (tlb_way.valid[j]))
                    with m.If(is_tag_hit):
                        comb += hit_way_set[j].eq(i)
                        comb += s_hit.eq(1)
                comb += hit_set[j].eq(s_hit)
                comb += rel_matches[j].eq(s_tag == reload_tag)
            # now pick the results of the TLB way that actually hit
            with m.If(tlb_hit.valid):
                comb += is_hit.eq(hit_set[tlb_hit.way])
                comb += hit_way.eq(hit_way_set[tlb_hit.way])
                comb += rel_match.eq(rel_matches[tlb_hit.way])
        with m.Else():
            # real mode: the tag comes straight from the request address
            s_tag       = Signal(TAG_BITS)
            comb += s_tag.eq(get_tag(req_addr))
            for i in range(NUM_WAYS): # way_t
                is_tag_hit = Signal(name="is_tag_hit_%d" % i)
                comb += is_tag_hit.eq(go & cache_i_validdx[i] &
                          (read_tag(i, cache_tag_set) == s_tag))
                with m.If(is_tag_hit):
                    comb += hit_way.eq(i)
                    comb += is_hit.eq(1)
            with m.If(s_tag == reload_tag):
                comb += rel_match.eq(1)

        return m
 675
 676
 677 class DCache(Elaboratable):
 678     """Set associative dcache write-through
 679
 680     TODO (in no specific order):
 681     * See list in icache.vhdl
 682     * Complete load misses on the cycle when WB data comes instead of
 683       at the end of line (this requires dealing with requests coming in
 684       while not idle...)
 685     """
 686     def __init__(self, pspec=None):
 687         self.d_in      = LoadStore1ToDCacheType("d_in")
 688         self.d_out     = DCacheToLoadStore1Type("d_out")
 689
 690         self.m_in      = MMUToDCacheType("m_in")
 691         self.m_out     = DCacheToMMUType("m_out")
 692
 693         self.stall_out = Signal()
 694
 695         # standard naming (wired to non-standard for compatibility)
 696         self.bus = Interface(addr_width=32,
 697                             data_width=64,
 698                             granularity=8,
 699                             features={'stall'},
 700                             alignment=0,
 701                             name="dcache")
 702
 703         self.log_out   = Signal(20)
 704
 705         # test if microwatt compatibility is to be enabled
 706         self.microwatt_compat = (hasattr(pspec, "microwatt_compat") and
 707                                  (pspec.microwatt_compat == True))
 708
    def stage_0(self, m, r0, r1, r0_full):
        """Latch the request in r0.req as long as we're not stalling

        A candidate request "r" is built combinatorially, either from the
        MMU port (m_in, which takes priority when valid) or from the
        loadstore1 port (d_in).  It is then registered into r0, with
        r0_full recording whether the latched request is valid.

        Arguments:
        * m:       the Module to add statements to
        * r0:      stage 0 register (written here)
        * r1:      stage 1 register (only r1.full is read)
        * r0_full: flag register for r0 (written here)
        """
        comb = m.d.comb
        sync = m.d.sync
        d_in, d_out, m_in = self.d_in, self.d_out, self.m_in

        r = RegStage0("stage0")

        # TODO, this goes in unit tests and formal proofs
        with m.If(d_in.valid & m_in.valid):
            sync += Display("request collision loadstore vs MMU")

        with m.If(m_in.valid):
            # MMU request: a load unless it is a tlbie or tlbld, always
            # cachable, real-mode, privileged, with all bytes selected
            comb += r.req.valid.eq(1)
            comb += r.req.load.eq(~(m_in.tlbie | m_in.tlbld))# no invalidate
            comb += r.req.dcbz.eq(0)
            comb += r.req.nc.eq(0)
            comb += r.req.reserve.eq(0)
            comb += r.req.virt_mode.eq(0)
            comb += r.req.priv_mode.eq(1)
            comb += r.req.addr.eq(m_in.addr)
            comb += r.req.data.eq(m_in.pte)
            comb += r.req.byte_sel.eq(~0) # Const -1 sets all to 0b111....
            comb += r.tlbie.eq(m_in.tlbie)
            comb += r.doall.eq(m_in.doall)
            comb += r.tlbld.eq(m_in.tlbld)
            comb += r.mmu_req.eq(1)
            # the data (PTE) arrives together with the MMU request
            comb += r.d_valid.eq(1)
            m.d.sync += Display("    DCACHE req mmu addr %x pte %x ld %d",
                                 m_in.addr, m_in.pte, r.req.load)

        with m.Else():
            # loadstore1 request: copied as-is, except that the data is
            # zeroed and marked not-yet-valid (it is sampled one cycle
            # later, in the Elif further down)
            comb += r.req.eq(d_in)
            comb += r.req.data.eq(0)
            comb += r.tlbie.eq(0)
            comb += r.doall.eq(0)
            comb += r.tlbld.eq(0)
            comb += r.mmu_req.eq(0)
            comb += r.d_valid.eq(0)

        # default: r0_full is cleared; it is only set again when a new
        # request is latched in the If just below.
        # NOTE(review): this clears r0_full in every cycle in which no
        # request is latched, including while r1 is full - confirm that
        # this is intended.
        sync += r0_full.eq(0)
        with m.If((~r1.full & ~d_in.hold) | ~r0_full):
            sync += r0.eq(r)
            sync += r0_full.eq(r.req.valid)
        with m.Elif(~r0.d_valid):
            # Sample data the cycle after a request comes in from loadstore1.
            # If another request has come in already then the data will get
            # put directly into req.data below.
            sync += r0.req.data.eq(d_in.data)
            sync += r0.d_valid.eq(1)
        # debug trace of incoming loadstore1 requests
        with m.If(d_in.valid):
            m.d.sync += Display("    DCACHE req cache "
                                "virt %d addr %x data %x ld %d",
                                 r.req.virt_mode, r.req.addr,
                                 r.req.data, r.req.load)
 765
 766     def tlb_read(self, m, r0_stall, tlb_way):
 767         """TLB
 768         Operates in the second cycle on the request latched in r0.req.
 769         TLB updates write the entry at the end of the second cycle.
 770         """
 771         comb = m.d.comb
 772         sync = m.d.sync
 773         m_in, d_in = self.m_in, self.d_in
 774
 775         addrbits = Signal(TLB_SET_BITS)
 776
 777         amin = TLB_LG_PGSZ
 778         amax = TLB_LG_PGSZ + TLB_SET_BITS
 779
 780         with m.If(m_in.valid):
 781             comb += addrbits.eq(m_in.addr[amin : amax])
 782         with m.Else():
 783             comb += addrbits.eq(d_in.addr[amin : amax])
 784
 785         # If we have any op and the previous op isn't finished,
 786         # then keep the same output for next cycle.
 787         d = self.dtlb_update
 788         comb += d.tlb_read_index.eq(addrbits)
 789         comb += d.tlb_read.eq(~r0_stall)
 790         comb += tlb_way.eq(d.tlb_way)
 791
 792     def maybe_tlb_plrus(self, m, r1, tlb_plru_victim, tlb_req_index):
 793         """Generate TLB PLRUs
 794         """
 795         comb = m.d.comb
 796         sync = m.d.sync
 797
 798         if TLB_NUM_WAYS == 0:
 799             return
 800
 801         # suite of PLRUs with a selection and output mechanism
 802         tlb_plrus = PLRUs(TLB_SET_SIZE, TLB_WAY_BITS)
 803         m.submodules.tlb_plrus = tlb_plrus
 804         comb += tlb_plrus.way.eq(r1.tlb_hit.way)
 805         comb += tlb_plrus.valid.eq(r1.tlb_hit.valid)
 806         comb += tlb_plrus.index.eq(r1.tlb_hit_index)
 807         comb += tlb_plrus.isel.eq(tlb_req_index) # select victim
 808         comb += tlb_plru_victim.eq(tlb_plrus.o_index) # selected victim
 809
 810     def tlb_search(self, m, tlb_req_index, r0, r0_valid,
 811                    tlb_way,
 812                    pte, tlb_hit, valid_ra, perm_attr, ra):
 813
 814         comb = m.d.comb
 815
 816         hitway = Signal(TLB_WAY_BITS)
 817         hit    = Signal()
 818         eatag  = Signal(TLB_EA_TAG_BITS)
 819
 820         TLB_LG_END = TLB_LG_PGSZ + TLB_SET_BITS
 821         comb += tlb_req_index.eq(r0.req.addr[TLB_LG_PGSZ : TLB_LG_END])
 822         comb += eatag.eq(r0.req.addr[TLB_LG_END : 64 ])
 823
 824         for i in range(TLB_NUM_WAYS):
 825             is_tag_hit = Signal(name="is_tag_hit%d" % i)
 826             tlb_tag = Signal(TLB_EA_TAG_BITS, name="tlb_tag%d" % i)
 827             comb += tlb_tag.eq(read_tlb_tag(i, tlb_way.tag))
 828             comb += is_tag_hit.eq((tlb_way.valid[i]) & (tlb_tag == eatag))
 829             with m.If(is_tag_hit):
 830                 comb += hitway.eq(i)
 831                 comb += hit.eq(1)
 832
 833         comb += tlb_hit.valid.eq(hit & r0_valid)
 834         comb += tlb_hit.way.eq(hitway)
 835
 836         with m.If(tlb_hit.valid):
 837             comb += pte.eq(read_tlb_pte(hitway, tlb_way.pte))
 838         comb += valid_ra.eq(tlb_hit.valid | ~r0.req.virt_mode)
 839
 840         with m.If(r0.req.virt_mode):
 841             comb += ra.eq(Cat(Const(0, ROW_OFF_BITS),
 842                               r0.req.addr[ROW_OFF_BITS:TLB_LG_PGSZ],
 843                               pte[TLB_LG_PGSZ:REAL_ADDR_BITS]))
 844             comb += perm_attr.reference.eq(pte[8])
 845             comb += perm_attr.changed.eq(pte[7])
 846             comb += perm_attr.nocache.eq(pte[5])
 847             comb += perm_attr.priv.eq(pte[3])
 848             comb += perm_attr.rd_perm.eq(pte[2])
 849             comb += perm_attr.wr_perm.eq(pte[1])
 850         with m.Else():
 851             comb += ra.eq(Cat(Const(0, ROW_OFF_BITS),
 852                               r0.req.addr[ROW_OFF_BITS:REAL_ADDR_BITS]))
 853             comb += perm_attr.reference.eq(1)
 854             comb += perm_attr.changed.eq(1)
 855             comb += perm_attr.nocache.eq(0)
 856             comb += perm_attr.priv.eq(1)
 857             comb += perm_attr.rd_perm.eq(1)
 858             comb += perm_attr.wr_perm.eq(1)
 859
 860         with m.If(valid_ra):
 861             m.d.sync += Display("DCACHE virt mode %d hit %d ra %x pte %x",
 862                                 r0.req.virt_mode, tlb_hit.valid, ra, pte)
 863             m.d.sync += Display("       perm ref=%d", perm_attr.reference)
 864             m.d.sync += Display("       perm chg=%d", perm_attr.changed)
 865             m.d.sync += Display("       perm noc=%d", perm_attr.nocache)
 866             m.d.sync += Display("       perm prv=%d", perm_attr.priv)
 867             m.d.sync += Display("       perm rdp=%d", perm_attr.rd_perm)
 868             m.d.sync += Display("       perm wrp=%d", perm_attr.wr_perm)
 869
 870     def tlb_update(self, m, r0_valid, r0, tlb_req_index,
 871                     tlb_hit, tlb_plru_victim):
 872
 873         comb = m.d.comb
 874         sync = m.d.sync
 875
 876         tlbie    = Signal()
 877         tlbwe    = Signal()
 878
 879         comb += tlbie.eq(r0_valid & r0.tlbie)
 880         comb += tlbwe.eq(r0_valid & r0.tlbld)
 881
 882         d = self.dtlb_update
 883
 884         comb += d.tlbie.eq(tlbie)
 885         comb += d.tlbwe.eq(tlbwe)
 886         comb += d.doall.eq(r0.doall)
 887         comb += d.tlb_hit.eq(tlb_hit)
 888         comb += d.tlb_req_index.eq(tlb_req_index)
 889
 890         with m.If(tlb_hit.valid):
 891             comb += d.repl_way.eq(tlb_hit.way)
 892         with m.Else():
 893             comb += d.repl_way.eq(tlb_plru_victim)
 894         comb += d.eatag.eq(r0.req.addr[TLB_LG_PGSZ + TLB_SET_BITS:64])
 895         comb += d.pte_data.eq(r0.req.data)
 896
 897     def maybe_plrus(self, m, r1, plru_victim):
 898         """Generate PLRUs
 899         """
 900         comb = m.d.comb
 901         sync = m.d.sync
 902
 903         if TLB_NUM_WAYS == 0:
 904             return
 905
 906         # suite of PLRUs with a selection and output mechanism
 907         m.submodules.plrus = plrus = PLRUs(NUM_LINES, WAY_BITS)
 908         comb += plrus.way.eq(r1.hit_way)
 909         comb += plrus.valid.eq(r1.cache_hit)
 910         comb += plrus.index.eq(r1.hit_index)
 911         comb += plrus.isel.eq(r1.store_index) # select victim
 912         comb += plru_victim.eq(plrus.o_index) # selected victim
 913
 914     def cache_tag_read(self, m, r0_stall, req_index, cache_tag_set, cache_tags):
 915         """Cache tag RAM read port
 916         """
 917         comb = m.d.comb
 918         sync = m.d.sync
 919         m_in, d_in = self.m_in, self.d_in
 920
 921         index = Signal(INDEX_BITS)
 922
 923         with m.If(r0_stall):
 924             comb += index.eq(req_index)
 925         with m.Elif(m_in.valid):
 926             comb += index.eq(get_index(m_in.addr))
 927         with m.Else():
 928             comb += index.eq(get_index(d_in.addr))
 929         sync += cache_tag_set.eq(cache_tags[index].tag)
 930
    def dcache_request(self, m, r0, ra, req_index, req_row, req_tag,
                       r0_valid, r1, cache_tags, replace_way,
                       use_forward1_next, use_forward2_next,
                       req_hit_way, plru_victim, rc_ok, perm_attr,
                       valid_ra, perm_ok, access_ok, req_op, req_go,
                       tlb_hit, tlb_way, cache_tag_set,
                       cancel_store, req_same_tag, r0_stall, early_req_row):
        """Cache request parsing and hit detection

        Purely combinatorial apart from the debug Display statements.
        Using the request latched in r0 and the translated address ra,
        this method drives:

        * req_index / req_row / req_tag: line, row and tag of the request
        * req_hit_way, req_same_tag: result of the DCachePendingHit
          submodule, overridden when the request matches the line that
          is currently being reloaded
        * use_forward1_next / use_forward2_next: whether the row+way
          being accessed matches a recent write recorded in r1
        * replace_way: plru_victim while r1.write_tag is set, otherwise
          r1.store_way
        * rc_ok, perm_ok, access_ok: permission checks from perm_attr
        * req_op, req_go: the decoded operation (an Op value) and
          whether the request goes ahead at all
        * early_req_row: the row to read from the data RAMs
        """

        comb = m.d.comb
        m_in, d_in = self.m_in, self.d_in

        is_hit      = Signal()
        hit_way     = Signal(WAY_BITS)
        op          = Signal(Op)
        opsel       = Signal(3)
        go          = Signal()
        nc          = Signal()
        cache_i_validdx = Signal(NUM_WAYS)

        # Extract line, row and tag from request
        # (index and row come from the untranslated request address,
        # the tag from the translated address ra)
        comb += req_index.eq(get_index(r0.req.addr))
        comb += req_row.eq(get_row(r0.req.addr))
        comb += req_tag.eq(get_tag(ra))

        if False: # display on comb is a bit... busy.
            comb += Display("dcache_req addr:%x ra: %x idx: %x tag: %x row: %x",
                    r0.req.addr, ra, req_index, req_tag, req_row)

        # a request proceeds only if it is valid, is not a TLB
        # invalidate/load, and no load/store error is flagged in r1
        comb += go.eq(r0_valid & ~(r0.tlbie | r0.tlbld) & ~r1.ls_error)
        # per-way valid bits of the line being addressed
        comb += cache_i_validdx.eq(cache_tags[req_index].valid)

        # hit detection is delegated to the DCachePendingHit submodule
        m.submodules.dcache_pend = dc = DCachePendingHit(tlb_way,
                                            cache_i_validdx, cache_tag_set,
                                            r0.req.addr)
        comb += dc.tlb_hit.eq(tlb_hit)
        comb += dc.reload_tag.eq(r1.reload_tag)
        comb += dc.virt_mode.eq(r0.req.virt_mode)
        comb += dc.go.eq(go)
        comb += dc.req_index.eq(req_index)

        # defaults; is_hit and hit_way may be overridden just below
        comb += is_hit.eq(dc.is_hit)
        comb += hit_way.eq(dc.hit_way)
        comb += req_same_tag.eq(dc.rel_match)

        # See if the request matches the line currently being reloaded
        with m.If((r1.state == State.RELOAD_WAIT_ACK) &
                  (req_index == r1.store_index) & req_same_tag):
            # For a store, consider this a hit even if the row isn't
            # valid since it will be by the time we perform the store.
            # For a load, check the appropriate row valid bit.
            rrow = Signal(ROW_LINE_BITS)
            comb += rrow.eq(req_row)
            valid = r1.rows_valid[rrow]
            comb += is_hit.eq((~r0.req.load) | valid)
            comb += hit_way.eq(replace_way)

        # Whether to use forwarded data for a load or not
        with m.If((get_row(r1.req.real_addr) == req_row) &
                  (r1.req.hit_way == hit_way)):
            # Only need to consider r1.write_bram here, since if we
            # are writing refill data here, then we don't have a
            # cache hit this cycle on the line being refilled.
            # (There is the possibility that the load following the
            # load miss that started the refill could be to the old
            # contents of the victim line, since it is a couple of
            # cycles after the refill starts before we see the updated
            # cache tag. In that case we don't use the bypass.)
            comb += use_forward1_next.eq(r1.write_bram)
        # second-level forward: same row and way as the write
        # recorded in r1.forward_row1 / r1.forward_way1
        with m.If((r1.forward_row1 == req_row) & (r1.forward_way1 == hit_way)):
            comb += use_forward2_next.eq(r1.forward_valid1)

        # The way that matched on a hit
        comb += req_hit_way.eq(hit_way)

        # The way to replace on a miss
        with m.If(r1.write_tag):
            comb += replace_way.eq(plru_victim)
        with m.Else():
            comb += replace_way.eq(r1.store_way)

        # work out whether we have permission for this access
        # NB we don't yet implement AMR, thus no KUAP
        # rc_ok: reference bit set, and changed bit set unless a load
        comb += rc_ok.eq(perm_attr.reference
                         & (r0.req.load | perm_attr.changed))
        # perm_ok: privileged request or unprivileged page, and either
        # write permission or (load with read permission)
        comb += perm_ok.eq((r0.req.priv_mode | (~perm_attr.priv)) &
                           (perm_attr.wr_perm |
                              (r0.req.load & perm_attr.rd_perm)))
        comb += access_ok.eq(valid_ra & perm_ok & rc_ok)

        # Combine the request and cache hit status to decide what
        # operation needs to be done
        comb += nc.eq(r0.req.nc | perm_attr.nocache)
        comb += op.eq(Op.OP_NONE)
        with m.If(go):
            with m.If(~access_ok):
                m.d.sync += Display("DCACHE access fail valid_ra=%d p=%d rc=%d",
                                 valid_ra, perm_ok, rc_ok)
                comb += op.eq(Op.OP_BAD)
            with m.Elif(cancel_store):
                m.d.sync += Display("DCACHE cancel store")
                comb += op.eq(Op.OP_STCX_FAIL)
            with m.Else():
                m.d.sync += Display("DCACHE valid_ra=%d nc=%d ld=%d",
                                 valid_ra, nc, r0.req.load)
                # Cat() places its first argument in the LSB, so
                # opsel is {load, nc, is_hit} reading MSB to LSB
                comb += opsel.eq(Cat(is_hit, nc, r0.req.load))
                with m.Switch(opsel):
                    with m.Case(0b101): comb += op.eq(Op.OP_LOAD_HIT)
                    with m.Case(0b100): comb += op.eq(Op.OP_LOAD_MISS)
                    with m.Case(0b110): comb += op.eq(Op.OP_LOAD_NC)
                    with m.Case(0b001): comb += op.eq(Op.OP_STORE_HIT)
                    with m.Case(0b000): comb += op.eq(Op.OP_STORE_MISS)
                    with m.Case(0b010): comb += op.eq(Op.OP_STORE_MISS)
                    # a hit on a non-cacheable access is flagged as bad
                    with m.Case(0b011): comb += op.eq(Op.OP_BAD)
                    with m.Case(0b111): comb += op.eq(Op.OP_BAD)
        comb += req_op.eq(op)
        comb += req_go.eq(go)

        # Version of the row number that is valid one cycle earlier
        # in the cases where we need to read the cache data BRAM.
        # If we're stalling then we need to keep reading the last
        # row requested.
        with m.If(~r0_stall):
            with m.If(m_in.valid):
                comb += early_req_row.eq(get_row(m_in.addr))
            with m.Else():
                comb += early_req_row.eq(get_row(d_in.addr))
        with m.Else():
            comb += early_req_row.eq(req_row)
1061
1062     def reservation_comb(self, m, cancel_store, set_rsrv, clear_rsrv,
1063                          r0_valid, r0, reservation):
1064         """Handle load-with-reservation and store-conditional instructions
1065         """
1066         comb = m.d.comb
1067
1068         with m.If(r0_valid & r0.req.reserve):
1069             # XXX generate alignment interrupt if address
1070             # is not aligned XXX or if r0.req.nc = '1'
1071             with m.If(r0.req.load):
1072                 comb += set_rsrv.eq(r0.req.atomic_last) # load with reservation
1073             with m.Else():
1074                 comb += clear_rsrv.eq(r0.req.atomic_last) # store conditional
1075                 with m.If((~reservation.valid) |
1076                          (r0.req.addr[LINE_OFF_BITS:64] != reservation.addr)):
1077                     comb += cancel_store.eq(1)
1078
1079     def reservation_reg(self, m, r0_valid, access_ok, set_rsrv, clear_rsrv,
1080                         reservation, r0):
1081         comb = m.d.comb
1082         sync = m.d.sync
1083
1084         with m.If(r0_valid & access_ok):
1085             with m.If(clear_rsrv):
1086                 sync += reservation.valid.eq(0)
1087             with m.Elif(set_rsrv):
1088                 sync += reservation.valid.eq(1)
1089                 sync += reservation.addr.eq(r0.req.addr[LINE_OFF_BITS:64])
1090
1091     def writeback_control(self, m, r1, cache_out_row):
1092         """Return data for loads & completion control logic
1093         """
1094         comb = m.d.comb
1095         sync = m.d.sync
1096         d_out, m_out = self.d_out, self.m_out
1097
1098         data_out = Signal(64)
1099         data_fwd = Signal(64)
1100
1101         # Use the bypass if are reading the row that was
1102         # written 1 or 2 cycles ago, including for the
1103         # slow_valid = 1 case (i.e. completing a load
1104         # miss or a non-cacheable load).
1105         with m.If(r1.use_forward1):
1106             comb += data_fwd.eq(r1.forward_data1)
1107         with m.Else():
1108             comb += data_fwd.eq(r1.forward_data2)
1109
1110         comb += data_out.eq(cache_out_row)
1111
1112         for i in range(8):
1113             with m.If(r1.forward_sel[i]):
1114                 dsel = data_fwd.word_select(i, 8)
1115                 comb += data_out.word_select(i, 8).eq(dsel)
1116
1117         # DCache output to LoadStore
1118         comb += d_out.valid.eq(r1.ls_valid)
1119         comb += d_out.data.eq(data_out)
1120         comb += d_out.store_done.eq(~r1.stcx_fail)
1121         comb += d_out.error.eq(r1.ls_error)
1122         comb += d_out.cache_paradox.eq(r1.cache_paradox)
1123
1124         # Outputs to MMU
1125         comb += m_out.done.eq(r1.mmu_done)
1126         comb += m_out.err.eq(r1.mmu_error)
1127         comb += m_out.data.eq(data_out)
1128
1129         # We have a valid load or store hit or we just completed
1130         # a slow op such as a load miss, a NC load or a store
1131         #
1132         # Note: the load hit is delayed by one cycle. However it
1133         # can still not collide with r.slow_valid (well unless I
1134         # miscalculated) because slow_valid can only be set on a
1135         # subsequent request and not on its first cycle (the state
1136         # machine must have advanced), which makes slow_valid
1137         # at least 2 cycles from the previous hit_load_valid.
1138
1139         # Sanity: Only one of these must be set in any given cycle
1140
1141         if False: # TODO: need Display to get this to work
1142             assert (r1.slow_valid & r1.stcx_fail) != 1, \
1143             "unexpected slow_valid collision with stcx_fail"
1144
1145             assert ((r1.slow_valid | r1.stcx_fail) | r1.hit_load_valid) != 1, \
1146              "unexpected hit_load_delayed collision with slow_valid"
1147
1148         with m.If(~r1.mmu_req):
1149             # Request came from loadstore1...
1150             # Load hit case is the standard path
1151             with m.If(r1.hit_load_valid):
1152                 sync += Display("completing load hit data=%x", data_out)
1153
1154             # error cases complete without stalling
1155             with m.If(r1.ls_error):
1156                 with m.If(r1.dcbz):
1157                     sync += Display("completing dcbz with error")
1158                 with m.Else():
1159                     sync += Display("completing ld/st with error")
1160
1161             # Slow ops (load miss, NC, stores)
1162             with m.If(r1.slow_valid):
1163                 sync += Display("completing store or load miss adr=%x data=%x",
1164                                 r1.req.real_addr, data_out)
1165
1166         with m.Else():
1167             # Request came from MMU
1168             with m.If(r1.hit_load_valid):
1169                 sync += Display("completing load hit to MMU, data=%x",
1170                                 m_out.data)
1171             # error cases complete without stalling
1172             with m.If(r1.mmu_error):
1173                 sync += Display("combpleting MMU ld with error")
1174
1175             # Slow ops (i.e. load miss)
1176             with m.If(r1.slow_valid):
1177                 sync += Display("completing MMU load miss, adr=%x data=%x",
1178                                 r1.req.real_addr, m_out.data)
1179
1180     def rams(self, m, r1, early_req_row, cache_out_row, replace_way):
1181         """rams
1182         Generate a cache RAM for each way. This handles the normal
1183         reads, writes from reloads and the special store-hit update
1184         path as well.
1185
1186         Note: the BRAMs have an extra read buffer, meaning the output
1187         is pipelined an extra cycle. This differs from the
1188         icache. The writeback logic needs to take that into
1189         account by using 1-cycle delayed signals for load hits.
1190         """
1191         comb = m.d.comb
1192         bus = self.bus
1193
1194         # a Binary-to-Unary one-hots here.  replace-way one-hot is gated
1195         # (enabled) by bus.ack, not-write-bram, and state RELOAD_WAIT_ACK
1196         m.submodules.rams_replace_way_e = rwe = Decoder(NUM_WAYS)
1197         comb += rwe.n.eq(~((r1.state == State.RELOAD_WAIT_ACK) & bus.ack &
1198                    ~r1.write_bram))
1199         comb += rwe.i.eq(replace_way)
1200
1201         m.submodules.rams_hit_way_e = hwe = Decoder(NUM_WAYS)
1202         comb += hwe.i.eq(r1.hit_way)
1203
1204         # this one is gated with write_bram, and replace_way_e can never be
1205         # set at the same time.  that means that do_write can OR the outputs
1206         m.submodules.rams_hit_req_way_e = hre = Decoder(NUM_WAYS)
1207         comb += hre.n.eq(~r1.write_bram) # Decoder.n is inverted
1208         comb += hre.i.eq(r1.req.hit_way)
1209
1210         # common Signals
1211         do_read  = Signal()
1212         wr_addr  = Signal(ROW_BITS)
1213         wr_data  = Signal(WB_DATA_BITS)
1214         wr_sel   = Signal(ROW_SIZE)
1215         rd_addr  = Signal(ROW_BITS)
1216
1217         comb += do_read.eq(1) # always enable
1218         comb += rd_addr.eq(early_req_row)
1219
1220         # Write mux:
1221         #
1222         # Defaults to wishbone read responses (cache refill)
1223         #
1224         # For timing, the mux on wr_data/sel/addr is not
1225         # dependent on anything other than the current state.
1226
1227         with m.If(r1.write_bram):
1228             # Write store data to BRAM.  This happens one
1229             # cycle after the store is in r0.
1230             comb += wr_data.eq(r1.req.data)
1231             comb += wr_sel.eq(r1.req.byte_sel)
1232             comb += wr_addr.eq(get_row(r1.req.real_addr))
1233
1234         with m.Else():
1235             # Otherwise, we might be doing a reload or a DCBZ
1236             with m.If(r1.dcbz):
1237                 comb += wr_data.eq(0)
1238             with m.Else():
1239                 comb += wr_data.eq(bus.dat_r)
1240             comb += wr_addr.eq(r1.store_row)
1241             comb += wr_sel.eq(~0) # all 1s
1242
1243         # set up Cache Rams
1244         for i in range(NUM_WAYS):
1245             do_write = Signal(name="do_wr%d" % i)
1246             wr_sel_m = Signal(ROW_SIZE, name="wr_sel_m_%d" % i)
1247             d_out   = Signal(WB_DATA_BITS, name="dout_%d" % i) # cache_row_t
1248
1249             way = CacheRam(ROW_BITS, WB_DATA_BITS, ADD_BUF=True, ram_num=i)
1250             m.submodules["cacheram_%d" % i] = way
1251
1252             comb += way.rd_en.eq(do_read)
1253             comb += way.rd_addr.eq(rd_addr)
1254             comb += d_out.eq(way.rd_data_o)
1255             comb += way.wr_sel.eq(wr_sel_m)
1256             comb += way.wr_addr.eq(wr_addr)
1257             comb += way.wr_data.eq(wr_data)
1258
1259             # Cache hit reads
1260             with m.If(hwe.o[i]):
1261                 comb += cache_out_row.eq(d_out)
1262
1263             # these are mutually-exclusive via their Decoder-enablers
1264             # (note: Decoder-enable is inverted)
1265             comb += do_write.eq(hre.o[i] | rwe.o[i])
1266
1267             # Mask write selects with do_write since BRAM
1268             # doesn't have a global write-enable
1269             with m.If(do_write):
1270                 comb += wr_sel_m.eq(wr_sel)
1271
1272     # Cache hit synchronous machine for the easy case.
1273     # This handles load hits.
1274     # It also handles error cases (TLB miss, cache paradox)
1275     def dcache_fast_hit(self, m, req_op, r0_valid, r0, r1,
1276                         req_hit_way, req_index, req_tag, access_ok,
1277                         tlb_hit, tlb_req_index):
1278         comb = m.d.comb
1279         sync = m.d.sync
1280
1281         with m.If(req_op != Op.OP_NONE):
1282             sync += Display("op:%d addr:%x nc: %d idx: %x tag: %x way: %x",
1283                     req_op, r0.req.addr, r0.req.nc,
1284                     req_index, req_tag, req_hit_way)
1285
1286         with m.If(r0_valid):
1287             sync += r1.mmu_req.eq(r0.mmu_req)
1288
1289         # Fast path for load/store hits.
1290         # Set signals for the writeback controls.
1291         sync += r1.hit_way.eq(req_hit_way)
1292         sync += r1.hit_index.eq(req_index)
1293
1294         sync += r1.hit_load_valid.eq(req_op == Op.OP_LOAD_HIT)
1295         sync += r1.cache_hit.eq((req_op == Op.OP_LOAD_HIT) |
1296                                 (req_op == Op.OP_STORE_HIT))
1297
1298         with m.If(req_op == Op.OP_BAD):
1299             sync += Display("Signalling ld/st error "
1300                             "ls_error=%i mmu_error=%i cache_paradox=%i",
1301                             ~r0.mmu_req,r0.mmu_req,access_ok)
1302             sync += r1.ls_error.eq(~r0.mmu_req)
1303             sync += r1.mmu_error.eq(r0.mmu_req)
1304             sync += r1.cache_paradox.eq(access_ok)
1305         with m.Else():
1306             sync += r1.ls_error.eq(0)
1307             sync += r1.mmu_error.eq(0)
1308             sync += r1.cache_paradox.eq(0)
1309
1310         sync += r1.stcx_fail.eq(req_op == Op.OP_STCX_FAIL)
1311
1312         # Record TLB hit information for updating TLB PLRU
1313         sync += r1.tlb_hit.eq(tlb_hit)
1314         sync += r1.tlb_hit_index.eq(tlb_req_index)
1315
1316     # Memory accesses are handled by this state machine:
1317     #
1318     #   * Cache load miss/reload (in conjunction with "rams")
1319     #   * Load hits for non-cachable forms
1320     #   * Stores (the collision case is handled in "rams")
1321     #
1322     # All wishbone requests generation is done here.
1323     # This machine operates at stage 1.
1324     def dcache_slow(self, m, r1, use_forward1_next, use_forward2_next,
1325                     r0, replace_way,
1326                     req_hit_way, req_same_tag,
1327                     r0_valid, req_op, cache_tags, req_go, ra):
1328
1329         comb = m.d.comb
1330         sync = m.d.sync
1331         bus = self.bus
1332         d_in = self.d_in
1333
1334         req         = MemAccessRequest("mreq_ds")
1335
1336         r1_next_cycle = Signal()
1337         req_row = Signal(ROW_BITS)
1338         req_idx = Signal(INDEX_BITS)
1339         req_tag = Signal(TAG_BITS)
1340         comb += req_idx.eq(get_index(req.real_addr))
1341         comb += req_row.eq(get_row(req.real_addr))
1342         comb += req_tag.eq(get_tag(req.real_addr))
1343
1344         sync += r1.use_forward1.eq(use_forward1_next)
1345         sync += r1.forward_sel.eq(0)
1346
1347         with m.If(use_forward1_next):
1348             sync += r1.forward_sel.eq(r1.req.byte_sel)
1349         with m.Elif(use_forward2_next):
1350             sync += r1.forward_sel.eq(r1.forward_sel1)
1351
1352         sync += r1.forward_data2.eq(r1.forward_data1)
1353         with m.If(r1.write_bram):
1354             sync += r1.forward_data1.eq(r1.req.data)
1355             sync += r1.forward_sel1.eq(r1.req.byte_sel)
1356             sync += r1.forward_way1.eq(r1.req.hit_way)
1357             sync += r1.forward_row1.eq(get_row(r1.req.real_addr))
1358             sync += r1.forward_valid1.eq(1)
1359         with m.Else():
1360             with m.If(r1.dcbz):
1361                 sync += r1.forward_data1.eq(0)
1362             with m.Else():
1363                 sync += r1.forward_data1.eq(bus.dat_r)
1364             sync += r1.forward_sel1.eq(~0) # all 1s
1365             sync += r1.forward_way1.eq(replace_way)
1366             sync += r1.forward_row1.eq(r1.store_row)
1367             sync += r1.forward_valid1.eq(0)
1368
1369         # One cycle pulses reset
1370         sync += r1.slow_valid.eq(0)
1371         sync += r1.write_bram.eq(0)
1372         sync += r1.inc_acks.eq(0)
1373         sync += r1.dec_acks.eq(0)
1374
1375         sync += r1.ls_valid.eq(0)
1376         # complete tlbies and TLB loads in the third cycle
1377         sync += r1.mmu_done.eq(r0_valid & (r0.tlbie | r0.tlbld))
1378
1379         with m.If((req_op == Op.OP_LOAD_HIT) | (req_op == Op.OP_STCX_FAIL)):
1380             with m.If(r0.mmu_req):
1381                 sync += r1.mmu_done.eq(1)
1382             with m.Else():
1383                 sync += r1.ls_valid.eq(1)
1384
1385         with m.If(r1.write_tag):
1386             # Store new tag in selected way
1387             replace_way_onehot = Signal(NUM_WAYS)
1388             comb += replace_way_onehot.eq(1<<replace_way)
1389             for i in range(NUM_WAYS):
1390                 with m.If(replace_way_onehot[i]):
1391                     ct = Signal(TAG_RAM_WIDTH)
1392                     comb += ct.eq(cache_tags[r1.store_index].tag)
1393                     comb += ct.word_select(i, TAG_WIDTH).eq(r1.reload_tag)
1394                     sync += cache_tags[r1.store_index].tag.eq(ct)
1395             sync += r1.store_way.eq(replace_way)
1396             sync += r1.write_tag.eq(0)
1397
1398         # Take request from r1.req if there is one there,
1399         # else from req_op, ra, etc.
1400         with m.If(r1.full):
1401             comb += req.eq(r1.req)
1402         with m.Else():
1403             comb += req.op.eq(req_op)
1404             comb += req.valid.eq(req_go)
1405             comb += req.mmu_req.eq(r0.mmu_req)
1406             comb += req.dcbz.eq(r0.req.dcbz)
1407             comb += req.real_addr.eq(ra)
1408
1409             with m.If(r0.req.dcbz):
1410                 # force data to 0 for dcbz
1411                 comb += req.data.eq(0)
1412             with m.Elif(r0.d_valid):
1413                 comb += req.data.eq(r0.req.data)
1414             with m.Else():
1415                 comb += req.data.eq(d_in.data)
1416
1417             # Select all bytes for dcbz
1418             # and for cacheable loads
1419             with m.If(r0.req.dcbz | (r0.req.load & ~r0.req.nc)):
1420                 comb += req.byte_sel.eq(~0) # all 1s
1421             with m.Else():
1422                 comb += req.byte_sel.eq(r0.req.byte_sel)
1423             comb += req.hit_way.eq(req_hit_way)
1424             comb += req.same_tag.eq(req_same_tag)
1425
1426             # Store the incoming request from r0,
1427             # if it is a slow request
1428             # Note that r1.full = 1 implies req_op = OP_NONE
1429             with m.If((req_op == Op.OP_LOAD_MISS)
1430                       | (req_op == Op.OP_LOAD_NC)
1431                       | (req_op == Op.OP_STORE_MISS)
1432                       | (req_op == Op.OP_STORE_HIT)):
1433                 sync += r1.req.eq(req)
1434                 sync += r1.full.eq(1)
1435                 # do not let r1.state RELOAD_WAIT_ACK or STORE_WAIT_ACK
1436                 # destroy r1.req by overwriting r1.full back to zero
1437                 comb += r1_next_cycle.eq(1)
1438
1439         # Main state machine
1440         with m.Switch(r1.state):
1441
1442             with m.Case(State.IDLE):
1443                 sync += r1.wb.adr.eq(req.real_addr[ROW_LINE_BITS:])
1444                 sync += r1.wb.sel.eq(req.byte_sel)
1445                 sync += r1.wb.dat.eq(req.data)
1446                 sync += r1.dcbz.eq(req.dcbz)
1447
1448                 # Keep track of our index and way
1449                 # for subsequent stores.
1450                 sync += r1.store_index.eq(req_idx)
1451                 sync += r1.store_row.eq(req_row)
1452                 sync += r1.end_row_ix.eq(get_row_of_line(req_row)-1)
1453                 sync += r1.reload_tag.eq(req_tag)
1454                 sync += r1.req.same_tag.eq(1)
1455
1456                 with m.If(req.op == Op.OP_STORE_HIT):
1457                     sync += r1.store_way.eq(req.hit_way)
1458
1459                 #with m.If(r1.dec_acks):
1460                 #    sync += r1.acks_pending.eq(r1.acks_pending - 1)
1461
1462                 # Reset per-row valid bits,
1463                 # ready for handling OP_LOAD_MISS
1464                 for i in range(ROW_PER_LINE):
1465                     sync += r1.rows_valid[i].eq(0)
1466
1467                 with m.If(req_op != Op.OP_NONE):
1468                     sync += Display("cache op %d", req.op)
1469
1470                 with m.Switch(req.op):
1471                     with m.Case(Op.OP_LOAD_HIT):
1472                         # stay in IDLE state
1473                         pass
1474
1475                     with m.Case(Op.OP_LOAD_MISS):
1476                         sync += Display("cache miss real addr: %x " \
1477                                 "idx: %x tag: %x",
1478                                 req.real_addr, req_row, req_tag)
1479
1480                         # Start the wishbone cycle
1481                         sync += r1.wb.we.eq(0)
1482                         sync += r1.wb.cyc.eq(1)
1483                         sync += r1.wb.stb.eq(1)
1484
1485                         # Track that we had one request sent
1486                         sync += r1.state.eq(State.RELOAD_WAIT_ACK)
1487                         sync += r1.write_tag.eq(1)
1488
1489                     with m.Case(Op.OP_LOAD_NC):
1490                         sync += r1.wb.cyc.eq(1)
1491                         sync += r1.wb.stb.eq(1)
1492                         sync += r1.wb.we.eq(0)
1493                         sync += r1.state.eq(State.NC_LOAD_WAIT_ACK)
1494
1495                     with m.Case(Op.OP_STORE_HIT, Op.OP_STORE_MISS):
1496                         with m.If(~req.dcbz):
1497                             sync += r1.state.eq(State.STORE_WAIT_ACK)
1498                             sync += r1.acks_pending.eq(1)
1499                             sync += r1.full.eq(0)
1500                             comb += r1_next_cycle.eq(0)
1501                             sync += r1.slow_valid.eq(1)
1502
1503                             with m.If(req.mmu_req):
1504                                 sync += r1.mmu_done.eq(1)
1505                             with m.Else():
1506                                 sync += r1.ls_valid.eq(1)
1507
1508                             with m.If(req.op == Op.OP_STORE_HIT):
1509                                 sync += r1.write_bram.eq(1)
1510                         with m.Else():
1511                             # dcbz is handled much like a load miss except
1512                             # that we are writing to memory instead of reading
1513                             sync += r1.state.eq(State.RELOAD_WAIT_ACK)
1514
1515                             with m.If(req.op == Op.OP_STORE_MISS):
1516                                 sync += r1.write_tag.eq(1)
1517
1518                         sync += r1.wb.we.eq(1)
1519                         sync += r1.wb.cyc.eq(1)
1520                         sync += r1.wb.stb.eq(1)
1521
1522                     # OP_NONE and OP_BAD do nothing
1523                     # OP_BAD & OP_STCX_FAIL were
1524                     # handled above already
1525                     with m.Case(Op.OP_NONE):
1526                         pass
1527                     with m.Case(Op.OP_BAD):
1528                         pass
1529                     with m.Case(Op.OP_STCX_FAIL):
1530                         pass
1531
1532             with m.Case(State.RELOAD_WAIT_ACK):
1533                 ld_stbs_done = Signal()
1534                 # Requests are all sent if stb is 0
1535                 comb += ld_stbs_done.eq(~r1.wb.stb)
1536
1537                 # If we are still sending requests, was one accepted?
1538                 with m.If((~bus.stall) & r1.wb.stb):
1539                     # That was the last word?  We are done sending.
1540                     # Clear stb and set ld_stbs_done so we can handle an
1541                     # eventual last ack on the same cycle.
1542                     # sigh - reconstruct wb adr with 3 extra 0s at front
1543                     wb_adr = Cat(Const(0, ROW_OFF_BITS), r1.wb.adr)
1544                     with m.If(is_last_row_addr(wb_adr, r1.end_row_ix)):
1545                         sync += r1.wb.stb.eq(0)
1546                         comb += ld_stbs_done.eq(1)
1547
1548                     # Calculate the next row address in the current cache line
1549                     row = Signal(LINE_OFF_BITS-ROW_OFF_BITS)
1550                     comb += row.eq(r1.wb.adr)
1551                     sync += r1.wb.adr[:LINE_OFF_BITS-ROW_OFF_BITS].eq(row+1)
1552
1553                 # Incoming acks processing
1554                 sync += r1.forward_valid1.eq(bus.ack)
1555                 with m.If(bus.ack):
1556                     srow = Signal(ROW_LINE_BITS)
1557                     comb += srow.eq(r1.store_row)
1558                     sync += r1.rows_valid[srow].eq(1)
1559
1560                     # If this is the data we were looking for,
1561                     # we can complete the request next cycle.
1562                     # Compare the whole address in case the
1563                     # request in r1.req is not the one that
1564                     # started this refill.
1565                     with m.If(r1.full & r1.req.same_tag &
1566                               ((r1.dcbz & req.dcbz) |
1567                                (r1.req.op == Op.OP_LOAD_MISS)) &
1568                                 (r1.store_row == get_row(r1.req.real_addr))):
1569                         sync += r1.full.eq(r1_next_cycle)
1570                         sync += r1.slow_valid.eq(1)
1571                         with m.If(r1.mmu_req):
1572                             sync += r1.mmu_done.eq(1)
1573                         with m.Else():
1574                             sync += r1.ls_valid.eq(1)
1575                         sync += r1.forward_sel.eq(~0) # all 1s
1576                         sync += r1.use_forward1.eq(1)
1577
1578                     # Check for completion
1579                     with m.If(ld_stbs_done & is_last_row(r1.store_row,
1580                                                       r1.end_row_ix)):
1581                         # Complete wishbone cycle
1582                         sync += r1.wb.cyc.eq(0)
1583
1584                         # Cache line is now valid
1585                         cv = Signal(INDEX_BITS)
1586                         comb += cv.eq(cache_tags[r1.store_index].valid)
1587                         comb += cv.bit_select(r1.store_way, 1).eq(1)
1588                         sync += cache_tags[r1.store_index].valid.eq(cv)
1589
1590                         sync += r1.state.eq(State.IDLE)
1591                         sync += Display("cache valid set %x "
1592                                         "idx %d way %d",
1593                                          cv, r1.store_index, r1.store_way)
1594
1595                     # Increment store row counter
1596                     sync += r1.store_row.eq(next_row(r1.store_row))
1597
1598             with m.Case(State.STORE_WAIT_ACK):
1599                 st_stbs_done = Signal()
1600                 adjust_acks = Signal(3)
1601
1602                 comb += st_stbs_done.eq(~r1.wb.stb)
1603
1604                 with m.If(r1.inc_acks != r1.dec_acks):
1605                     with m.If(r1.inc_acks):
1606                         comb += adjust_acks.eq(r1.acks_pending + 1)
1607                     with m.Else():
1608                         comb += adjust_acks.eq(r1.acks_pending - 1)
1609                 with m.Else():
1610                     comb += adjust_acks.eq(r1.acks_pending)
1611
1612                 sync += r1.acks_pending.eq(adjust_acks)
1613
1614                 # Clear stb when slave accepted request
1615                 with m.If(~bus.stall):
1616                     # See if there is another store waiting
1617                     # to be done which is in the same real page.
1618                     with m.If(req.valid):
1619                         _ra = req.real_addr[ROW_LINE_BITS:SET_SIZE_BITS]
1620                         sync += r1.wb.adr[0:SET_SIZE_BITS].eq(_ra)
1621                         sync += r1.wb.dat.eq(req.data)
1622                         sync += r1.wb.sel.eq(req.byte_sel)
1623
1624                     with m.If((adjust_acks < 7) & req.same_tag &
1625                                 ((req.op == Op.OP_STORE_MISS) |
1626                                  (req.op == Op.OP_STORE_HIT))):
1627                         sync += r1.wb.stb.eq(1)
1628                         comb += st_stbs_done.eq(0)
1629                         sync += r1.store_way.eq(req.hit_way)
1630                         sync += r1.store_row.eq(get_row(req.real_addr))
1631
1632                         with m.If(req.op == Op.OP_STORE_HIT):
1633                             sync += r1.write_bram.eq(1)
1634                         sync += r1.full.eq(r1_next_cycle)
1635                         sync += r1.slow_valid.eq(1)
1636
1637                         # Store requests never come from the MMU
1638                         sync += r1.ls_valid.eq(1)
1639                         comb += st_stbs_done.eq(0)
1640                         sync += r1.inc_acks.eq(1)
1641                     with m.Else():
1642                         sync += r1.wb.stb.eq(0)
1643                         comb += st_stbs_done.eq(1)
1644
1645                 # Got ack ? See if complete.
1646                 sync += Display("got ack %d %d stbs %d adjust_acks %d",
1647                                 bus.ack, bus.ack, st_stbs_done, adjust_acks)
1648                 with m.If(bus.ack):
1649                     with m.If(st_stbs_done & (adjust_acks == 1)):
1650                         sync += r1.state.eq(State.IDLE)
1651                         sync += r1.wb.cyc.eq(0)
1652                         sync += r1.wb.stb.eq(0)
1653                     sync += r1.dec_acks.eq(1)
1654
1655             with m.Case(State.NC_LOAD_WAIT_ACK):
1656                 # Clear stb when slave accepted request
1657                 with m.If(~bus.stall):
1658                     sync += r1.wb.stb.eq(0)
1659
1660                 # Got ack ? complete.
1661                 with m.If(bus.ack):
1662                     sync += r1.state.eq(State.IDLE)
1663                     sync += r1.full.eq(r1_next_cycle)
1664                     sync += r1.slow_valid.eq(1)
1665
1666                     with m.If(r1.mmu_req):
1667                         sync += r1.mmu_done.eq(1)
1668                     with m.Else():
1669                         sync += r1.ls_valid.eq(1)
1670
1671                     sync += r1.forward_sel.eq(~0) # all 1s
1672                     sync += r1.use_forward1.eq(1)
1673                     sync += r1.wb.cyc.eq(0)
1674                     sync += r1.wb.stb.eq(0)
1675
1676     def dcache_log(self, m, r1, valid_ra, tlb_hit, stall_out):
1677
1678         sync = m.d.sync
1679         d_out, bus, log_out = self.d_out, self.bus, self.log_out
1680
1681         sync += log_out.eq(Cat(r1.state[:3], valid_ra, tlb_hit.way[:3],
1682                                stall_out, req_op[:3], d_out.valid, d_out.error,
1683                                r1.wb.cyc, r1.wb.stb, bus.ack, bus.stall,
1684                                r1.real_adr[3:6]))
1685
1686     def elaborate(self, platform):
1687
1688         m = Module()
1689         comb = m.d.comb
1690         d_in = self.d_in
1691
1692         # Storage. Hopefully "cache_rows" is a BRAM, the rest is LUTs
1693         cache_tags       = CacheTagArray()
1694         cache_tag_set    = Signal(TAG_RAM_WIDTH)
1695
1696         # TODO attribute ram_style : string;
1697         # TODO attribute ram_style of cache_tags : signal is "distributed";
1698
1699         """note: these are passed to nmigen.hdl.Memory as "attributes".
1700            don't know how, just that they are.
1701         """
1702         # TODO attribute ram_style of
1703         #  dtlb_tags : signal is "distributed";
1704         # TODO attribute ram_style of
1705         #  dtlb_ptes : signal is "distributed";
1706
1707         r0      = RegStage0("r0")
1708         r0_full = Signal()
1709
1710         r1 = RegStage1("r1")
1711
1712         reservation = Reservation("rsrv")
1713
1714         # Async signals on incoming request
1715         req_index    = Signal(INDEX_BITS)
1716         req_row      = Signal(ROW_BITS)
1717         req_hit_way  = Signal(WAY_BITS)
1718         req_tag      = Signal(TAG_BITS)
1719         req_op       = Signal(Op)
1720         req_data     = Signal(64)
1721         req_same_tag = Signal()
1722         req_go       = Signal()
1723
1724         early_req_row     = Signal(ROW_BITS)
1725
1726         cancel_store      = Signal()
1727         set_rsrv          = Signal()
1728         clear_rsrv        = Signal()
1729
1730         r0_valid          = Signal()
1731         r0_stall          = Signal()
1732
1733         use_forward1_next = Signal()
1734         use_forward2_next = Signal()
1735
1736         cache_out_row     = Signal(WB_DATA_BITS)
1737
1738         plru_victim       = Signal(WAY_BITS)
1739         replace_way       = Signal(WAY_BITS)
1740
1741         # Wishbone read/write/cache write formatting signals
1742         bus_sel           = Signal(8)
1743
1744         # TLB signals
1745         tlb_way       = TLBRecord("tlb_way")
1746         tlb_req_index = Signal(TLB_SET_BITS)
1747         tlb_hit       = TLBHit("tlb_hit")
1748         pte           = Signal(TLB_PTE_BITS)
1749         ra            = Signal(REAL_ADDR_BITS)
1750         valid_ra      = Signal()
1751         perm_attr     = PermAttr("dc_perms")
1752         rc_ok         = Signal()
1753         perm_ok       = Signal()
1754         access_ok     = Signal()
1755
1756         tlb_plru_victim = Signal(TLB_WAY_BITS)
1757
1758         # we don't yet handle collisions between loadstore1 requests
1759         # and MMU requests
1760         comb += self.m_out.stall.eq(0)
1761
1762         # Hold off the request in r0 when r1 has an uncompleted request
1763         comb += r0_stall.eq(r0_full & (r1.full | d_in.hold))
1764         comb += r0_valid.eq(r0_full & ~r1.full & ~d_in.hold)
1765         comb += self.stall_out.eq(r0_stall)
1766
1767         # deal with litex not doing wishbone pipeline mode
1768         # XXX in wrong way.  FIFOs are needed in the SRAM test
1769         # so that stb/ack match up. same thing done in icache.py
1770         if not self.microwatt_compat:
1771             comb += self.bus.stall.eq(self.bus.cyc & ~self.bus.ack)
1772
1773         # Wire up wishbone request latch out of stage 1
1774         comb += self.bus.we.eq(r1.wb.we)
1775         comb += self.bus.adr.eq(r1.wb.adr)
1776         comb += self.bus.sel.eq(r1.wb.sel)
1777         comb += self.bus.stb.eq(r1.wb.stb)
1778         comb += self.bus.dat_w.eq(r1.wb.dat)
1779         comb += self.bus.cyc.eq(r1.wb.cyc)
1780
1781         # create submodule TLBUpdate
1782         m.submodules.dtlb_update = self.dtlb_update = DTLBUpdate()
1783
1784         # call sub-functions putting everything together, using shared
1785         # signals established above
1786         self.stage_0(m, r0, r1, r0_full)
1787         self.tlb_read(m, r0_stall, tlb_way)
1788         self.tlb_search(m, tlb_req_index, r0, r0_valid,
1789                         tlb_way,
1790                         pte, tlb_hit, valid_ra, perm_attr, ra)
1791         self.tlb_update(m, r0_valid, r0, tlb_req_index,
1792                         tlb_hit, tlb_plru_victim)
1793         self.maybe_plrus(m, r1, plru_victim)
1794         self.maybe_tlb_plrus(m, r1, tlb_plru_victim, tlb_req_index)
1795         self.cache_tag_read(m, r0_stall, req_index, cache_tag_set, cache_tags)
1796         self.dcache_request(m, r0, ra, req_index, req_row, req_tag,
1797                            r0_valid, r1, cache_tags, replace_way,
1798                            use_forward1_next, use_forward2_next,
1799                            req_hit_way, plru_victim, rc_ok, perm_attr,
1800                            valid_ra, perm_ok, access_ok, req_op, req_go,
1801                            tlb_hit, tlb_way, cache_tag_set,
1802                            cancel_store, req_same_tag, r0_stall, early_req_row)
1803         self.reservation_comb(m, cancel_store, set_rsrv, clear_rsrv,
1804                            r0_valid, r0, reservation)
1805         self.reservation_reg(m, r0_valid, access_ok, set_rsrv, clear_rsrv,
1806                            reservation, r0)
1807         self.writeback_control(m, r1, cache_out_row)
1808         self.rams(m, r1, early_req_row, cache_out_row, replace_way)
1809         self.dcache_fast_hit(m, req_op, r0_valid, r0, r1,
1810                         req_hit_way, req_index, req_tag, access_ok,
1811                         tlb_hit, tlb_req_index)
1812         self.dcache_slow(m, r1, use_forward1_next, use_forward2_next,
1813                     r0, replace_way,
1814                     req_hit_way, req_same_tag,
1815                          r0_valid, req_op, cache_tags, req_go, ra)
1816         #self.dcache_log(m, r1, valid_ra, tlb_hit, stall_out)
1817
1818         return m
1819
1820
if __name__ == '__main__':
    # The imports at the top of this file only bring in "main" from
    # nmigen.cli, so "rtlil" would be an undefined name here.  Import the
    # RTLIL back-end locally; it is only needed when run as a script.
    from nmigen.back import rtlil

    # Elaborate a default-configured DCache and dump it as RTLIL text.
    dut = DCache()
    vl = rtlil.convert(dut, ports=[])
    with open("test_dcache.il", "w") as f:
        f.write(vl)