src/soc/experiment/dcache.py

   1 """DCache
   2
   3 based on Anton Blanchard microwatt dcache.vhdl
   4
   5 note that the microwatt dcache wishbone interface expects "stall".
   6 for simplicity at the moment this is hard-coded to cyc & ~ack.
   7 see WB4 spec, p84, section 5.2.1
   8
   9 IMPORTANT: for store, the data is sampled the cycle AFTER the "valid"
  10 is raised.  sigh
  11
  12 Links:
  13
  14 * https://libre-soc.org/3d_gpu/architecture/set_associative_cache.jpg
  15 * https://bugs.libre-soc.org/show_bug.cgi?id=469
  16
  17 """
  18
  19 import sys
  20
  21 from nmutil.gtkw import write_gtkw
  22
  23 sys.setrecursionlimit(1000000)
  24
  25 from enum import Enum, unique
  26
  27 from nmigen import (Module, Signal, Elaboratable, Cat, Repl, Array, Const,
  28                     Record, Memory)
  29 from nmutil.util import Display
  30 from nmigen.lib.coding import Decoder
  31
  32 from copy import deepcopy
  33 from random import randint, seed
  34
  35 from nmigen_soc.wishbone.bus import Interface
  36
  37 from nmigen.cli import main
  38 from nmutil.iocontrol import RecordObject
  39 from nmigen.utils import log2_int
  40 from soc.experiment.mem_types import (LoadStore1ToDCacheType,
  41                                      DCacheToLoadStore1Type,
  42                                      MMUToDCacheType,
  43                                      DCacheToMMUType)
  44
  45 from soc.experiment.wb_types import (WB_ADDR_BITS, WB_DATA_BITS, WB_SEL_BITS,
  46                                 WBAddrType, WBDataType, WBSelType,
  47                                 WBMasterOut, WBSlaveOut,
  48                                 WBMasterOutVector, WBSlaveOutVector,
  49                                 WBIOMasterOut, WBIOSlaveOut)
  50
  51 from soc.experiment.cache_ram import CacheRam
  52 #from soc.experiment.plru import PLRU
  53 from nmutil.plru import PLRU, PLRUs
  54
  55 # for test
  56 from soc.bus.sram import SRAM
  57 from nmigen import Memory
  58 from nmigen.cli import rtlil
  59
  60 # NOTE: to use cxxsim, export NMIGEN_SIM_MODE=cxxsim from the shell
  61 # Also, check out the cxxsim nmigen branch, and latest yosys from git
  62 from nmutil.sim_tmp_alternative import Simulator
  63
  64 from nmutil.util import wrap
  65
  66
  67 # TODO: make these parameters of DCache at some point
  68 LINE_SIZE = 64    # Line size in bytes
  69 NUM_LINES = 16    # Number of lines in a set
  70 NUM_WAYS = 4      # Number of ways
  71 TLB_SET_SIZE = 64 # L1 DTLB entries per set
  72 TLB_NUM_WAYS = 2  # L1 DTLB number of sets
  73 TLB_LG_PGSZ = 12  # L1 DTLB log_2(page_size)
  74 LOG_LENGTH = 0    # Non-zero to enable log data collection
  75
  76 # BRAM organisation: We never access more than
  77 #     -- WB_DATA_BITS at a time so to save
  78 #     -- resources we make the array only that wide, and
  79 #     -- use consecutive indices to make a cache "line"
  80 #     --
  81 #     -- ROW_SIZE is the width in bytes of the BRAM
  82 #     -- (based on WB, so 64-bits)
  83 ROW_SIZE = WB_DATA_BITS // 8;
  84
  85 # ROW_PER_LINE is the number of row (wishbone
  86 # transactions) in a line
  87 ROW_PER_LINE = LINE_SIZE // ROW_SIZE
  88
  89 # BRAM_ROWS is the number of rows in BRAM needed
  90 # to represent the full dcache
  91 BRAM_ROWS = NUM_LINES * ROW_PER_LINE
  92
  93 print ("ROW_SIZE", ROW_SIZE)
  94 print ("ROW_PER_LINE", ROW_PER_LINE)
  95 print ("BRAM_ROWS", BRAM_ROWS)
  96 print ("NUM_WAYS", NUM_WAYS)
  97
  98 # Bit fields counts in the address
  99
 100 # REAL_ADDR_BITS is the number of real address
 101 # bits that we store
 102 REAL_ADDR_BITS = 56
 103
 104 # ROW_BITS is the number of bits to select a row
 105 ROW_BITS = log2_int(BRAM_ROWS)
 106
 107 # ROW_LINE_BITS is the number of bits to select
 108 # a row within a line
 109 ROW_LINE_BITS = log2_int(ROW_PER_LINE)
 110
 111 # LINE_OFF_BITS is the number of bits for
 112 # the offset in a cache line
 113 LINE_OFF_BITS = log2_int(LINE_SIZE)
 114
 115 # ROW_OFF_BITS is the number of bits for
 116 # the offset in a row
 117 ROW_OFF_BITS = log2_int(ROW_SIZE)
 118
 119 # INDEX_BITS is the number if bits to
 120 # select a cache line
 121 INDEX_BITS = log2_int(NUM_LINES)
 122
 123 # SET_SIZE_BITS is the log base 2 of the set size
 124 SET_SIZE_BITS = LINE_OFF_BITS + INDEX_BITS
 125
 126 # TAG_BITS is the number of bits of
 127 # the tag part of the address
 128 TAG_BITS = REAL_ADDR_BITS - SET_SIZE_BITS
 129
 130 # TAG_WIDTH is the width in bits of each way of the tag RAM
 131 TAG_WIDTH = TAG_BITS + 7 - ((TAG_BITS + 7) % 8)
 132
 133 # WAY_BITS is the number of bits to select a way
 134 WAY_BITS = log2_int(NUM_WAYS)
 135
 136 # Example of layout for 32 lines of 64 bytes:
 137 layout = f"""\
 138   DCache Layout:
 139  |.. -----------------------| REAL_ADDR_BITS ({REAL_ADDR_BITS})
 140   ..         |--------------| SET_SIZE_BITS ({SET_SIZE_BITS})
 141   ..  tag    |index|  line  |
 142   ..         |   row   |    |
 143   ..         |     |---|    | ROW_LINE_BITS ({ROW_LINE_BITS})
 144   ..         |     |--- - --| LINE_OFF_BITS ({LINE_OFF_BITS})
 145   ..         |         |- --| ROW_OFF_BITS  ({ROW_OFF_BITS})
 146   ..         |----- ---|    | ROW_BITS      ({ROW_BITS})
 147   ..         |-----|        | INDEX_BITS    ({INDEX_BITS})
 148   .. --------|              | TAG_BITS      ({TAG_BITS})
 149 """
 150 print (layout)
 151 print ("Dcache TAG %d IDX %d ROW_BITS %d ROFF %d LOFF %d RLB %d" % \
 152             (TAG_BITS, INDEX_BITS, ROW_BITS,
 153              ROW_OFF_BITS, LINE_OFF_BITS, ROW_LINE_BITS))
 154 print ("index @: %d-%d" % (LINE_OFF_BITS, SET_SIZE_BITS))
 155 print ("row @: %d-%d" % (LINE_OFF_BITS, ROW_OFF_BITS))
 156 print ("tag @: %d-%d width %d" % (SET_SIZE_BITS, REAL_ADDR_BITS, TAG_WIDTH))
 157
 158 TAG_RAM_WIDTH = TAG_WIDTH * NUM_WAYS
 159
 160 print ("TAG_RAM_WIDTH", TAG_RAM_WIDTH)
 161 print ("    TAG_WIDTH", TAG_WIDTH)
 162 print ("     NUM_WAYS", NUM_WAYS)
 163 print ("    NUM_LINES", NUM_LINES)
 164
 165 def CacheTagArray():
 166     tag_layout = [('valid', NUM_WAYS),
 167                   ('tag', TAG_RAM_WIDTH),
 168                  ]
 169     return Array(Record(tag_layout, name="tag%d" % x) for x in range(NUM_LINES))
 170
 171 def RowPerLineValidArray():
 172     return Array(Signal(name="rows_valid%d" % x) \
 173                         for x in range(ROW_PER_LINE))
 174
 175 # L1 TLB
 176 TLB_SET_BITS     = log2_int(TLB_SET_SIZE)
 177 TLB_WAY_BITS     = log2_int(TLB_NUM_WAYS)
 178 TLB_EA_TAG_BITS  = 64 - (TLB_LG_PGSZ + TLB_SET_BITS)
 179 TLB_TAG_WAY_BITS = TLB_NUM_WAYS * TLB_EA_TAG_BITS
 180 TLB_PTE_BITS     = 64
 181 TLB_PTE_WAY_BITS = TLB_NUM_WAYS * TLB_PTE_BITS;
 182
 183 def ispow2(x):
 184     return (1<<log2_int(x, False)) == x
 185
 186 assert (LINE_SIZE % ROW_SIZE) == 0, "LINE_SIZE not multiple of ROW_SIZE"
 187 assert ispow2(LINE_SIZE), "LINE_SIZE not power of 2"
 188 assert ispow2(NUM_LINES), "NUM_LINES not power of 2"
 189 assert ispow2(ROW_PER_LINE), "ROW_PER_LINE not power of 2"
 190 assert ROW_BITS == (INDEX_BITS + ROW_LINE_BITS), "geometry bits don't add up"
 191 assert (LINE_OFF_BITS == ROW_OFF_BITS + ROW_LINE_BITS), \
 192         "geometry bits don't add up"
 193 assert REAL_ADDR_BITS == (TAG_BITS + INDEX_BITS + LINE_OFF_BITS), \
 194         "geometry bits don't add up"
 195 assert REAL_ADDR_BITS == (TAG_BITS + ROW_BITS + ROW_OFF_BITS), \
 196          "geometry bits don't add up"
 197 assert 64 == WB_DATA_BITS, "Can't yet handle wb width that isn't 64-bits"
 198 assert SET_SIZE_BITS <= TLB_LG_PGSZ, "Set indexed by virtual address"
 199
 200
 201 def TLBHit(name):
 202     return Record([('valid', 1),
 203                    ('way', TLB_WAY_BITS)], name=name)
 204
 205 def TLBTagEAArray():
 206     return Array(Signal(TLB_EA_TAG_BITS, name="tlbtagea%d" % x) \
 207                 for x in range (TLB_NUM_WAYS))
 208
 209 def TLBRecord(name):
 210     tlb_layout = [('valid', TLB_NUM_WAYS),
 211                   ('tag', TLB_TAG_WAY_BITS),
 212                   ('pte', TLB_PTE_WAY_BITS)
 213                  ]
 214     return Record(tlb_layout, name=name)
 215
 216 def TLBValidArray():
 217     return Array(Signal(TLB_NUM_WAYS, name="tlb_valid%d" % x)
 218                         for x in range(TLB_SET_SIZE))
 219
 220 def HitWaySet():
 221     return Array(Signal(WAY_BITS, name="hitway_%d" % x) \
 222                         for x in range(TLB_NUM_WAYS))
 223
 224 # Cache RAM interface
 225 def CacheRamOut():
 226     return Array(Signal(WB_DATA_BITS, name="cache_out%d" % x) \
 227                  for x in range(NUM_WAYS))
 228
 229 # PLRU output interface
 230 def PLRUOut():
 231     return Array(Signal(WAY_BITS, name="plru_out%d" % x) \
 232                 for x in range(NUM_LINES))
 233
 234 # TLB PLRU output interface
 235 def TLBPLRUOut():
 236     return Array(Signal(TLB_WAY_BITS, name="tlbplru_out%d" % x) \
 237                 for x in range(TLB_SET_SIZE))
 238
 239 # Helper functions to decode incoming requests
 240 #
 241 # Return the cache line index (tag index) for an address
 242 def get_index(addr):
 243     return addr[LINE_OFF_BITS:SET_SIZE_BITS]
 244
 245 # Return the cache row index (data memory) for an address
 246 def get_row(addr):
 247     return addr[ROW_OFF_BITS:SET_SIZE_BITS]
 248
 249 # Return the index of a row within a line
 250 def get_row_of_line(row):
 251     return row[:ROW_BITS][:ROW_LINE_BITS]
 252
 253 # Returns whether this is the last row of a line
 254 def is_last_row_addr(addr, last):
 255     return addr[ROW_OFF_BITS:LINE_OFF_BITS] == last
 256
 257 # Returns whether this is the last row of a line
 258 def is_last_row(row, last):
 259     return get_row_of_line(row) == last
 260
 261 # Return the next row in the current cache line. We use a
 262 # dedicated function in order to limit the size of the
 263 # generated adder to be only the bits within a cache line
 264 # (3 bits with default settings)
 265 def next_row(row):
 266     row_v = row[0:ROW_LINE_BITS] + 1
 267     return Cat(row_v[:ROW_LINE_BITS], row[ROW_LINE_BITS:])
 268
 269 # Get the tag value from the address
 270 def get_tag(addr):
 271     return addr[SET_SIZE_BITS:REAL_ADDR_BITS]
 272
 273 # Read a tag from a tag memory row
 274 def read_tag(way, tagset):
 275     return tagset.word_select(way, TAG_WIDTH)[:TAG_BITS]
 276
 277 # Read a TLB tag from a TLB tag memory row
 278 def read_tlb_tag(way, tags):
 279     return tags.word_select(way, TLB_EA_TAG_BITS)
 280
 281 # Write a TLB tag to a TLB tag memory row
 282 def write_tlb_tag(way, tags, tag):
 283     return read_tlb_tag(way, tags).eq(tag)
 284
 285 # Read a PTE from a TLB PTE memory row
 286 def read_tlb_pte(way, ptes):
 287     return ptes.word_select(way, TLB_PTE_BITS)
 288
 289 def write_tlb_pte(way, ptes, newpte):
 290     return read_tlb_pte(way, ptes).eq(newpte)
 291
 292
 293 # Record for storing permission, attribute, etc. bits from a PTE
 294 class PermAttr(RecordObject):
 295     def __init__(self, name=None):
 296         super().__init__(name=name)
 297         self.reference = Signal()
 298         self.changed   = Signal()
 299         self.nocache   = Signal()
 300         self.priv      = Signal()
 301         self.rd_perm   = Signal()
 302         self.wr_perm   = Signal()
 303
 304
 305 def extract_perm_attr(pte):
 306     pa = PermAttr()
 307     return pa;
 308
 309
 310 # Type of operation on a "valid" input
 311 @unique
 312 class Op(Enum):
 313     OP_NONE       = 0
 314     OP_BAD        = 1 # NC cache hit, TLB miss, prot/RC failure
 315     OP_STCX_FAIL  = 2 # conditional store w/o reservation
 316     OP_LOAD_HIT   = 3 # Cache hit on load
 317     OP_LOAD_MISS  = 4 # Load missing cache
 318     OP_LOAD_NC    = 5 # Non-cachable load
 319     OP_STORE_HIT  = 6 # Store hitting cache
 320     OP_STORE_MISS = 7 # Store missing cache
 321
 322
 323 # Cache state machine
 324 @unique
 325 class State(Enum):
 326     IDLE             = 0 # Normal load hit processing
 327     RELOAD_WAIT_ACK  = 1 # Cache reload wait ack
 328     STORE_WAIT_ACK   = 2 # Store wait ack
 329     NC_LOAD_WAIT_ACK = 3 # Non-cachable load wait ack
 330
 331
 332 # Dcache operations:
 333 #
 334 # In order to make timing, we use the BRAMs with
 335 # an output buffer, which means that the BRAM
 336 # output is delayed by an extra cycle.
 337 #
 338 # Thus, the dcache has a 2-stage internal pipeline
 339 # for cache hits with no stalls.
 340 #
 341 # All other operations are handled via stalling
 342 # in the first stage.
 343 #
 344 # The second stage can thus complete a hit at the same
 345 # time as the first stage emits a stall for a complex op.
 346 #
 347 # Stage 0 register, basically contains just the latched request
 348
 349 class RegStage0(RecordObject):
 350     def __init__(self, name=None):
 351         super().__init__(name=name)
 352         self.req     = LoadStore1ToDCacheType(name="lsmem")
 353         self.tlbie   = Signal() # indicates a tlbie request (from MMU)
 354         self.doall   = Signal() # with tlbie, indicates flush whole TLB
 355         self.tlbld   = Signal() # indicates a TLB load request (from MMU)
 356         self.mmu_req = Signal() # indicates source of request
 357         self.d_valid = Signal() # indicates req.data is valid now
 358
 359
 360 class MemAccessRequest(RecordObject):
 361     def __init__(self, name=None):
 362         super().__init__(name=name)
 363         self.op        = Signal(Op)
 364         self.valid     = Signal()
 365         self.dcbz      = Signal()
 366         self.real_addr = Signal(REAL_ADDR_BITS)
 367         self.data      = Signal(64)
 368         self.byte_sel  = Signal(8)
 369         self.hit_way   = Signal(WAY_BITS)
 370         self.same_tag  = Signal()
 371         self.mmu_req   = Signal()
 372
 373
 374 # First stage register, contains state for stage 1 of load hits
 375 # and for the state machine used by all other operations
 376 class RegStage1(RecordObject):
 377     def __init__(self, name=None):
 378         super().__init__(name=name)
 379         # Info about the request
 380         self.full             = Signal() # have uncompleted request
 381         self.mmu_req          = Signal() # request is from MMU
 382         self.req              = MemAccessRequest(name="reqmem")
 383
 384         # Cache hit state
 385         self.hit_way          = Signal(WAY_BITS)
 386         self.hit_load_valid   = Signal()
 387         self.hit_index        = Signal(INDEX_BITS)
 388         self.cache_hit        = Signal()
 389
 390         # TLB hit state
 391         self.tlb_hit          = TLBHit("tlb_hit")
 392         self.tlb_hit_index    = Signal(TLB_SET_BITS)
 393
 394         # 2-stage data buffer for data forwarded from writes to reads
 395         self.forward_data1    = Signal(64)
 396         self.forward_data2    = Signal(64)
 397         self.forward_sel1     = Signal(8)
 398         self.forward_valid1   = Signal()
 399         self.forward_way1     = Signal(WAY_BITS)
 400         self.forward_row1     = Signal(ROW_BITS)
 401         self.use_forward1     = Signal()
 402         self.forward_sel      = Signal(8)
 403
 404         # Cache miss state (reload state machine)
 405         self.state            = Signal(State)
 406         self.dcbz             = Signal()
 407         self.write_bram       = Signal()
 408         self.write_tag        = Signal()
 409         self.slow_valid       = Signal()
 410         self.wb               = WBMasterOut("wb")
 411         self.reload_tag       = Signal(TAG_BITS)
 412         self.store_way        = Signal(WAY_BITS)
 413         self.store_row        = Signal(ROW_BITS)
 414         self.store_index      = Signal(INDEX_BITS)
 415         self.end_row_ix       = Signal(ROW_LINE_BITS)
 416         self.rows_valid       = RowPerLineValidArray()
 417         self.acks_pending     = Signal(3)
 418         self.inc_acks         = Signal()
 419         self.dec_acks         = Signal()
 420
 421         # Signals to complete (possibly with error)
 422         self.ls_valid         = Signal()
 423         self.ls_error         = Signal()
 424         self.mmu_done         = Signal()
 425         self.mmu_error        = Signal()
 426         self.cache_paradox    = Signal()
 427
 428         # Signal to complete a failed stcx.
 429         self.stcx_fail        = Signal()
 430
 431
 432 # Reservation information
 433 class Reservation(RecordObject):
 434     def __init__(self):
 435         super().__init__()
 436         self.valid = Signal()
 437         self.addr  = Signal(64-LINE_OFF_BITS)
 438
 439
 440 class DTLBUpdate(Elaboratable):
 441     def __init__(self):
 442         self.tlbie    = Signal()
 443         self.tlbwe    = Signal()
 444         self.doall    = Signal()
 445         self.tlb_hit     = TLBHit("tlb_hit")
 446         self.tlb_req_index = Signal(TLB_SET_BITS)
 447
 448         self.repl_way        = Signal(TLB_WAY_BITS)
 449         self.eatag           = Signal(TLB_EA_TAG_BITS)
 450         self.pte_data        = Signal(TLB_PTE_BITS)
 451
 452         # read from dtlb array
 453         self.tlb_read       = Signal()
 454         self.tlb_read_index = Signal(TLB_SET_BITS)
 455         self.tlb_way        = TLBRecord("o_tlb_way")
 456
 457     def elaborate(self, platform):
 458         m = Module()
 459         comb = m.d.comb
 460         sync = m.d.sync
 461
 462         # there are 3 parts to this:
 463         # QTY TLB_NUM_WAYs TAGs - of width (say) 46 bits of Effective Address
 464         # QTY TLB_NUM_WAYs PTEs - of width (say) 64 bits
 465         # "Valid" bits, one per "way", of QTY TLB_NUM_WAYs.  these cannot
 466         # be a Memory because they can all be cleared (tlbie, doall), i mean,
 467         # we _could_, in theory, by overriding the Reset Signal of the Memory,
 468         # hmmm....
 469
 470         dtlb_valid = TLBValidArray()
 471         tlb_req_index = self.tlb_req_index
 472
 473         print ("TLB_TAG_WAY_BITS", TLB_TAG_WAY_BITS)
 474         print ("     TLB_EA_TAG_BITS", TLB_EA_TAG_BITS)
 475         print ("        TLB_NUM_WAYS", TLB_NUM_WAYS)
 476         print ("TLB_PTE_WAY_BITS", TLB_PTE_WAY_BITS)
 477         print ("    TLB_PTE_BITS", TLB_PTE_BITS)
 478         print ("    TLB_NUM_WAYS", TLB_NUM_WAYS)
 479
 480         # TAG and PTE Memory SRAMs. transparent, write-enables are TLB_NUM_WAYS
 481         tagway = Memory(depth=TLB_SET_SIZE, width=TLB_TAG_WAY_BITS)
 482         m.submodules.rd_tagway = rd_tagway = tagway.read_port()
 483         m.submodules.wr_tagway = wr_tagway = tagway.write_port(
 484                                     granularity=TLB_EA_TAG_BITS)
 485
 486         pteway = Memory(depth=TLB_SET_SIZE, width=TLB_PTE_WAY_BITS)
 487         m.submodules.rd_pteway = rd_pteway = pteway.read_port()
 488         m.submodules.wr_pteway = wr_pteway = pteway.write_port(
 489                                     granularity=TLB_PTE_BITS)
 490
 491         # commented out for now, can be put in if Memory.reset can be
 492         # used for tlbie&doall to reset the entire Memory to zero in 1 cycle
 493         #validm = Memory(depth=TLB_SET_SIZE, width=TLB_NUM_WAYS)
 494         #m.submodules.rd_valid = rd_valid = validm.read_port()
 495         #m.submodules.wr_valid = wr_valid = validm.write_port(
 496                                     #granularity=1)
 497
 498         # connect up read and write addresses to Valid/PTE/TAG SRAMs
 499         m.d.comb += rd_pteway.addr.eq(self.tlb_read_index)
 500         m.d.comb += rd_tagway.addr.eq(self.tlb_read_index)
 501         #m.d.comb += rd_valid.addr.eq(self.tlb_read_index)
 502         m.d.comb += wr_tagway.addr.eq(tlb_req_index)
 503         m.d.comb += wr_pteway.addr.eq(tlb_req_index)
 504         #m.d.comb += wr_valid.addr.eq(tlb_req_index)
 505
 506         updated  = Signal()
 507         v_updated  = Signal()
 508         tb_out = Signal(TLB_TAG_WAY_BITS) # tlb_way_tags_t
 509         db_out = Signal(TLB_NUM_WAYS)     # tlb_way_valids_t
 510         pb_out = Signal(TLB_PTE_WAY_BITS) # tlb_way_ptes_t
 511         dv = Signal(TLB_NUM_WAYS) # tlb_way_valids_t
 512
 513         comb += dv.eq(dtlb_valid[tlb_req_index])
 514         comb += db_out.eq(dv)
 515
 516         with m.If(self.tlbie & self.doall):
 517             # clear all valid bits at once
 518             # XXX hmmm, validm _could_ use Memory reset here...
 519             for i in range(TLB_SET_SIZE):
 520                 sync += dtlb_valid[i].eq(0)
 521         with m.Elif(self.tlbie):
 522             # invalidate just the hit_way
 523             with m.If(self.tlb_hit.valid):
 524                 comb += db_out.bit_select(self.tlb_hit.way, 1).eq(0)
 525                 comb += v_updated.eq(1)
 526         with m.Elif(self.tlbwe):
 527             # write to the requested tag and PTE
 528             comb += write_tlb_tag(self.repl_way, tb_out, self.eatag)
 529             comb += write_tlb_pte(self.repl_way, pb_out, self.pte_data)
 530             # set valid bit
 531             comb += db_out.bit_select(self.repl_way, 1).eq(1)
 532
 533             comb += updated.eq(1)
 534             comb += v_updated.eq(1)
 535
 536         # above, sometimes valid is requested to be updated but data not
 537         # therefore split them out, here.  note the granularity thing matches
 538         # with the shift-up of the eatag/pte_data into the correct TLB way.
 539         # thus is it not necessary to write the entire lot, just the portion
 540         # being altered: hence writing the *old* copy of the row is not needed
 541         with m.If(updated): # PTE and TAG to be written
 542             comb += wr_pteway.data.eq(pb_out)
 543             comb += wr_pteway.en.eq(1<<self.repl_way)
 544             comb += wr_tagway.data.eq(tb_out)
 545             comb += wr_tagway.en.eq(1<<self.repl_way)
 546         with m.If(v_updated): # Valid to be written
 547             sync += dtlb_valid[tlb_req_index].eq(db_out)
 548             #comb += wr_valid.data.eq(db_out)
 549             #comb += wr_valid.en.eq(1<<self.repl_way)
 550
 551         # select one TLB way, use a register here
 552         r_tlb_way        = TLBRecord("r_tlb_way")
 553         r_delay = Signal()
 554         sync += r_delay.eq(self.tlb_read)
 555         with m.If(self.tlb_read):
 556             sync += self.tlb_way.valid.eq(dtlb_valid[self.tlb_read_index])
 557         with m.If(r_delay):
 558             # on one clock delay, output the contents of the read port(s)
 559             # comb += self.tlb_way.valid.eq(rd_valid.data)
 560             comb += self.tlb_way.tag.eq(rd_tagway.data)
 561             comb += self.tlb_way.pte.eq(rd_pteway.data)
 562             # and also capture the (delayed) output...
 563             #sync += r_tlb_way.valid.eq(rd_valid.data)
 564             sync += r_tlb_way.tag.eq(rd_tagway.data)
 565             sync += r_tlb_way.pte.eq(rd_pteway.data)
 566         with m.Else():
 567             # ... so that the register can output it when no read is requested
 568             # it's rather overkill but better to be safe than sorry
 569             comb += self.tlb_way.tag.eq(r_tlb_way.tag)
 570             comb += self.tlb_way.pte.eq(r_tlb_way.pte)
 571             #comb += self.tlb_way.eq(r_tlb_way)
 572
 573         return m
 574
 575
 576 class DCachePendingHit(Elaboratable):
 577
 578     def __init__(self, tlb_way,
 579                       cache_i_validdx, cache_tag_set,
 580                     req_addr):
 581
 582         self.go          = Signal()
 583         self.virt_mode   = Signal()
 584         self.is_hit      = Signal()
 585         self.tlb_hit      = TLBHit("tlb_hit")
 586         self.hit_way     = Signal(WAY_BITS)
 587         self.rel_match   = Signal()
 588         self.req_index   = Signal(INDEX_BITS)
 589         self.reload_tag  = Signal(TAG_BITS)
 590
 591         self.tlb_way = tlb_way
 592         self.cache_i_validdx = cache_i_validdx
 593         self.cache_tag_set = cache_tag_set
 594         self.req_addr = req_addr
 595
 596     def elaborate(self, platform):
 597         m = Module()
 598         comb = m.d.comb
 599         sync = m.d.sync
 600
 601         go = self.go
 602         virt_mode = self.virt_mode
 603         is_hit = self.is_hit
 604         tlb_way = self.tlb_way
 605         cache_i_validdx = self.cache_i_validdx
 606         cache_tag_set = self.cache_tag_set
 607         req_addr = self.req_addr
 608         tlb_hit = self.tlb_hit
 609         hit_way = self.hit_way
 610         rel_match = self.rel_match
 611         req_index = self.req_index
 612         reload_tag = self.reload_tag
 613
 614         hit_set     = Array(Signal(name="hit_set_%d" % i) \
 615                                   for i in range(TLB_NUM_WAYS))
 616         rel_matches = Array(Signal(name="rel_matches_%d" % i) \
 617                                     for i in range(TLB_NUM_WAYS))
 618         hit_way_set = HitWaySet()
 619
 620         # Test if pending request is a hit on any way
 621         # In order to make timing in virtual mode,
 622         # when we are using the TLB, we compare each
 623         # way with each of the real addresses from each way of
 624         # the TLB, and then decide later which match to use.
 625
 626         with m.If(virt_mode):
 627             for j in range(TLB_NUM_WAYS): # tlb_num_way_t
 628                 s_tag       = Signal(TAG_BITS, name="s_tag%d" % j)
 629                 s_hit       = Signal()
 630                 s_pte       = Signal(TLB_PTE_BITS)
 631                 s_ra        = Signal(REAL_ADDR_BITS)
 632                 comb += s_pte.eq(read_tlb_pte(j, tlb_way.pte))
 633                 comb += s_ra.eq(Cat(req_addr[0:TLB_LG_PGSZ],
 634                                     s_pte[TLB_LG_PGSZ:REAL_ADDR_BITS]))
 635                 comb += s_tag.eq(get_tag(s_ra))
 636
 637                 for i in range(NUM_WAYS): # way_t
 638                     is_tag_hit = Signal(name="is_tag_hit_%d_%d" % (j, i))
 639                     comb += is_tag_hit.eq(go & cache_i_validdx[i] &
 640                                   (read_tag(i, cache_tag_set) == s_tag)
 641                                   & (tlb_way.valid[j]))
 642                     with m.If(is_tag_hit):
 643                         comb += hit_way_set[j].eq(i)
 644                         comb += s_hit.eq(1)
 645                 comb += hit_set[j].eq(s_hit)
 646                 with m.If(s_tag == reload_tag):
 647                     comb += rel_matches[j].eq(1)
 648             with m.If(tlb_hit.valid):
 649                 comb += is_hit.eq(hit_set[tlb_hit.way])
 650                 comb += hit_way.eq(hit_way_set[tlb_hit.way])
 651                 comb += rel_match.eq(rel_matches[tlb_hit.way])
 652         with m.Else():
 653             s_tag       = Signal(TAG_BITS)
 654             comb += s_tag.eq(get_tag(req_addr))
 655             for i in range(NUM_WAYS): # way_t
 656                 is_tag_hit = Signal(name="is_tag_hit_%d" % i)
 657                 comb += is_tag_hit.eq(go & cache_i_validdx[i] &
 658                           (read_tag(i, cache_tag_set) == s_tag))
 659                 with m.If(is_tag_hit):
 660                     comb += hit_way.eq(i)
 661                     comb += is_hit.eq(1)
 662             with m.If(s_tag == reload_tag):
 663                 comb += rel_match.eq(1)
 664
 665         return m
 666
 667
 668 class DCache(Elaboratable):
 669     """Set associative dcache write-through
 670
 671     TODO (in no specific order):
 672     * See list in icache.vhdl
 673     * Complete load misses on the cycle when WB data comes instead of
 674       at the end of line (this requires dealing with requests coming in
 675       while not idle...)
 676     """
 677     def __init__(self):
 678         self.d_in      = LoadStore1ToDCacheType("d_in")
 679         self.d_out     = DCacheToLoadStore1Type("d_out")
 680
 681         self.m_in      = MMUToDCacheType("m_in")
 682         self.m_out     = DCacheToMMUType("m_out")
 683
 684         self.stall_out = Signal()
 685
 686         # standard naming (wired to non-standard for compatibility)
 687         self.bus = Interface(addr_width=32,
 688                             data_width=64,
 689                             granularity=8,
 690                             features={'stall'},
 691                             alignment=0,
 692                             name="dcache")
 693
 694         self.log_out   = Signal(20)
 695
 696     def stage_0(self, m, r0, r1, r0_full):
 697         """Latch the request in r0.req as long as we're not stalling
 698         """
 699         comb = m.d.comb
 700         sync = m.d.sync
 701         d_in, d_out, m_in = self.d_in, self.d_out, self.m_in
 702
 703         r = RegStage0("stage0")
 704
 705         # TODO, this goes in unit tests and formal proofs
 706         with m.If(d_in.valid & m_in.valid):
 707             sync += Display("request collision loadstore vs MMU")
 708
 709         with m.If(m_in.valid):
 710             comb += r.req.valid.eq(1)
 711             comb += r.req.load.eq(~(m_in.tlbie | m_in.tlbld))# no invalidate
 712             comb += r.req.dcbz.eq(0)
 713             comb += r.req.nc.eq(0)
 714             comb += r.req.reserve.eq(0)
 715             comb += r.req.virt_mode.eq(0)
 716             comb += r.req.priv_mode.eq(1)
 717             comb += r.req.addr.eq(m_in.addr)
 718             comb += r.req.data.eq(m_in.pte)
 719             comb += r.req.byte_sel.eq(~0) # Const -1 sets all to 0b111....
 720             comb += r.tlbie.eq(m_in.tlbie)
 721             comb += r.doall.eq(m_in.doall)
 722             comb += r.tlbld.eq(m_in.tlbld)
 723             comb += r.mmu_req.eq(1)
 724             m.d.sync += Display("    DCACHE req mmu addr %x pte %x ld %d",
 725                                  m_in.addr, m_in.pte, r.req.load)
 726
 727         with m.Else():
 728             comb += r.req.eq(d_in)
 729             comb += r.req.data.eq(0)
 730             comb += r.tlbie.eq(0)
 731             comb += r.doall.eq(0)
 732             comb += r.tlbld.eq(0)
 733             comb += r.mmu_req.eq(0)
 734         with m.If((~r1.full & ~d_in.hold) | ~r0_full):
 735             sync += r0.eq(r)
 736             sync += r0_full.eq(r.req.valid)
 737             # Sample data the cycle after a request comes in from loadstore1.
 738             # If another request has come in already then the data will get
 739             # put directly into req.data below.
 740             with m.If(r0.req.valid & ~r.req.valid & ~r0.d_valid &
 741                      ~r0.mmu_req):
 742                 sync += r0.req.data.eq(d_in.data)
 743                 sync += r0.d_valid.eq(1)
 744         with m.If(d_in.valid):
 745             m.d.sync += Display("    DCACHE req cache "
 746                                 "virt %d addr %x data %x ld %d",
 747                                  r.req.virt_mode, r.req.addr,
 748                                  r.req.data, r.req.load)
 749
 750     def tlb_read(self, m, r0_stall, tlb_way):
 751         """TLB
 752         Operates in the second cycle on the request latched in r0.req.
 753         TLB updates write the entry at the end of the second cycle.
 754         """
 755         comb = m.d.comb
 756         sync = m.d.sync
 757         m_in, d_in = self.m_in, self.d_in
 758
 759         addrbits = Signal(TLB_SET_BITS)
 760
 761         amin = TLB_LG_PGSZ
 762         amax = TLB_LG_PGSZ + TLB_SET_BITS
 763
 764         with m.If(m_in.valid):
 765             comb += addrbits.eq(m_in.addr[amin : amax])
 766         with m.Else():
 767             comb += addrbits.eq(d_in.addr[amin : amax])
 768
 769         # If we have any op and the previous op isn't finished,
 770         # then keep the same output for next cycle.
 771         d = self.dtlb_update
 772         comb += d.tlb_read_index.eq(addrbits)
 773         comb += d.tlb_read.eq(~r0_stall)
 774         comb += tlb_way.eq(d.tlb_way)
 775
 776     def maybe_tlb_plrus(self, m, r1, tlb_plru_victim, tlb_req_index):
 777         """Generate TLB PLRUs
 778         """
 779         comb = m.d.comb
 780         sync = m.d.sync
 781
 782         if TLB_NUM_WAYS == 0:
 783             return
 784
 785         # suite of PLRUs with a selection and output mechanism
 786         tlb_plrus = PLRUs(TLB_SET_SIZE, TLB_WAY_BITS)
 787         m.submodules.tlb_plrus = tlb_plrus
 788         comb += tlb_plrus.way.eq(r1.tlb_hit.way)
 789         comb += tlb_plrus.valid.eq(r1.tlb_hit.valid)
 790         comb += tlb_plrus.index.eq(r1.tlb_hit_index)
 791         comb += tlb_plrus.isel.eq(tlb_req_index) # select victim
 792         comb += tlb_plru_victim.eq(tlb_plrus.o_index) # selected victim
 793
 794     def tlb_search(self, m, tlb_req_index, r0, r0_valid,
 795                    tlb_way,
 796                    pte, tlb_hit, valid_ra, perm_attr, ra):
 797
 798         comb = m.d.comb
 799
 800         hitway = Signal(TLB_WAY_BITS)
 801         hit    = Signal()
 802         eatag  = Signal(TLB_EA_TAG_BITS)
 803
 804         TLB_LG_END = TLB_LG_PGSZ + TLB_SET_BITS
 805         comb += tlb_req_index.eq(r0.req.addr[TLB_LG_PGSZ : TLB_LG_END])
 806         comb += eatag.eq(r0.req.addr[TLB_LG_END : 64 ])
 807
 808         for i in range(TLB_NUM_WAYS):
 809             is_tag_hit = Signal(name="is_tag_hit%d" % i)
 810             tlb_tag = Signal(TLB_EA_TAG_BITS, name="tlb_tag%d" % i)
 811             comb += tlb_tag.eq(read_tlb_tag(i, tlb_way.tag))
 812             comb += is_tag_hit.eq((tlb_way.valid[i]) & (tlb_tag == eatag))
 813             with m.If(is_tag_hit):
 814                 comb += hitway.eq(i)
 815                 comb += hit.eq(1)
 816
 817         comb += tlb_hit.valid.eq(hit & r0_valid)
 818         comb += tlb_hit.way.eq(hitway)
 819
 820         with m.If(tlb_hit.valid):
 821             comb += pte.eq(read_tlb_pte(hitway, tlb_way.pte))
 822         comb += valid_ra.eq(tlb_hit.valid | ~r0.req.virt_mode)
 823
 824         with m.If(r0.req.virt_mode):
 825             comb += ra.eq(Cat(Const(0, ROW_OFF_BITS),
 826                               r0.req.addr[ROW_OFF_BITS:TLB_LG_PGSZ],
 827                               pte[TLB_LG_PGSZ:REAL_ADDR_BITS]))
 828             comb += perm_attr.reference.eq(pte[8])
 829             comb += perm_attr.changed.eq(pte[7])
 830             comb += perm_attr.nocache.eq(pte[5])
 831             comb += perm_attr.priv.eq(pte[3])
 832             comb += perm_attr.rd_perm.eq(pte[2])
 833             comb += perm_attr.wr_perm.eq(pte[1])
 834         with m.Else():
 835             comb += ra.eq(Cat(Const(0, ROW_OFF_BITS),
 836                               r0.req.addr[ROW_OFF_BITS:REAL_ADDR_BITS]))
 837             comb += perm_attr.reference.eq(1)
 838             comb += perm_attr.changed.eq(1)
 839             comb += perm_attr.nocache.eq(0)
 840             comb += perm_attr.priv.eq(1)
 841             comb += perm_attr.rd_perm.eq(1)
 842             comb += perm_attr.wr_perm.eq(1)
 843
 844         with m.If(valid_ra):
 845             m.d.sync += Display("DCACHE virt mode %d hit %d ra %x pte %x",
 846                                 r0.req.virt_mode, tlb_hit.valid, ra, pte)
 847             m.d.sync += Display("       perm ref=%d", perm_attr.reference)
 848             m.d.sync += Display("       perm chg=%d", perm_attr.changed)
 849             m.d.sync += Display("       perm noc=%d", perm_attr.nocache)
 850             m.d.sync += Display("       perm prv=%d", perm_attr.priv)
 851             m.d.sync += Display("       perm rdp=%d", perm_attr.rd_perm)
 852             m.d.sync += Display("       perm wrp=%d", perm_attr.wr_perm)
 853
 854     def tlb_update(self, m, r0_valid, r0, tlb_req_index,
 855                     tlb_hit, tlb_plru_victim):
 856
 857         comb = m.d.comb
 858         sync = m.d.sync
 859
 860         tlbie    = Signal()
 861         tlbwe    = Signal()
 862
 863         comb += tlbie.eq(r0_valid & r0.tlbie)
 864         comb += tlbwe.eq(r0_valid & r0.tlbld)
 865
 866         d = self.dtlb_update
 867
 868         comb += d.tlbie.eq(tlbie)
 869         comb += d.tlbwe.eq(tlbwe)
 870         comb += d.doall.eq(r0.doall)
 871         comb += d.tlb_hit.eq(tlb_hit)
 872         comb += d.tlb_req_index.eq(tlb_req_index)
 873
 874         with m.If(tlb_hit.valid):
 875             comb += d.repl_way.eq(tlb_hit.way)
 876         with m.Else():
 877             comb += d.repl_way.eq(tlb_plru_victim)
 878         comb += d.eatag.eq(r0.req.addr[TLB_LG_PGSZ + TLB_SET_BITS:64])
 879         comb += d.pte_data.eq(r0.req.data)
 880
 881     def maybe_plrus(self, m, r1, plru_victim):
 882         """Generate PLRUs
 883         """
 884         comb = m.d.comb
 885         sync = m.d.sync
 886
 887         if TLB_NUM_WAYS == 0:
 888             return
 889
 890         # suite of PLRUs with a selection and output mechanism
 891         m.submodules.plrus = plrus = PLRUs(NUM_LINES, WAY_BITS)
 892         comb += plrus.way.eq(r1.hit_way)
 893         comb += plrus.valid.eq(r1.cache_hit)
 894         comb += plrus.index.eq(r1.hit_index)
 895         comb += plrus.isel.eq(r1.store_index) # select victim
 896         comb += plru_victim.eq(plrus.o_index) # selected victim
 897
 898     def cache_tag_read(self, m, r0_stall, req_index, cache_tag_set, cache_tags):
 899         """Cache tag RAM read port
 900         """
 901         comb = m.d.comb
 902         sync = m.d.sync
 903         m_in, d_in = self.m_in, self.d_in
 904
 905         index = Signal(INDEX_BITS)
 906
 907         with m.If(r0_stall):
 908             comb += index.eq(req_index)
 909         with m.Elif(m_in.valid):
 910             comb += index.eq(get_index(m_in.addr))
 911         with m.Else():
 912             comb += index.eq(get_index(d_in.addr))
 913         sync += cache_tag_set.eq(cache_tags[index].tag)
 914
 915     def dcache_request(self, m, r0, ra, req_index, req_row, req_tag,
 916                        r0_valid, r1, cache_tags, replace_way,
 917                        use_forward1_next, use_forward2_next,
 918                        req_hit_way, plru_victim, rc_ok, perm_attr,
 919                        valid_ra, perm_ok, access_ok, req_op, req_go,
 920                        tlb_hit, tlb_way, cache_tag_set,
 921                        cancel_store, req_same_tag, r0_stall, early_req_row):
 922         """Cache request parsing and hit detection
 923         """
 924
 925         comb = m.d.comb
 926         m_in, d_in = self.m_in, self.d_in
 927
 928         is_hit      = Signal()
 929         hit_way     = Signal(WAY_BITS)
 930         op          = Signal(Op)
 931         opsel       = Signal(3)
 932         go          = Signal()
 933         nc          = Signal()
 934         cache_i_validdx = Signal(NUM_WAYS)
 935
 936         # Extract line, row and tag from request
 937         comb += req_index.eq(get_index(r0.req.addr))
 938         comb += req_row.eq(get_row(r0.req.addr))
 939         comb += req_tag.eq(get_tag(ra))
 940
 941         if False: # display on comb is a bit... busy.
 942             comb += Display("dcache_req addr:%x ra: %x idx: %x tag: %x row: %x",
 943                     r0.req.addr, ra, req_index, req_tag, req_row)
 944
 945         comb += go.eq(r0_valid & ~(r0.tlbie | r0.tlbld) & ~r1.ls_error)
 946         comb += cache_i_validdx.eq(cache_tags[req_index].valid)
 947
 948         m.submodules.dcache_pend = dc = DCachePendingHit(tlb_way,
 949                                             cache_i_validdx, cache_tag_set,
 950                                             r0.req.addr)
 951         comb += dc.tlb_hit.eq(tlb_hit)
 952         comb += dc.reload_tag.eq(r1.reload_tag)
 953         comb += dc.virt_mode.eq(r0.req.virt_mode)
 954         comb += dc.go.eq(go)
 955         comb += dc.req_index.eq(req_index)
 956
 957         comb += is_hit.eq(dc.is_hit)
 958         comb += hit_way.eq(dc.hit_way)
 959         comb += req_same_tag.eq(dc.rel_match)
 960
 961         # See if the request matches the line currently being reloaded
 962         with m.If((r1.state == State.RELOAD_WAIT_ACK) &
 963                   (req_index == r1.store_index) & req_same_tag):
 964             # For a store, consider this a hit even if the row isn't
 965             # valid since it will be by the time we perform the store.
 966             # For a load, check the appropriate row valid bit.
 967             rrow = Signal(ROW_LINE_BITS)
 968             comb += rrow.eq(req_row)
 969             valid = r1.rows_valid[rrow]
 970             comb += is_hit.eq((~r0.req.load) | valid)
 971             comb += hit_way.eq(replace_way)
 972
 973         # Whether to use forwarded data for a load or not
 974         with m.If((get_row(r1.req.real_addr) == req_row) &
 975                   (r1.req.hit_way == hit_way)):
 976             # Only need to consider r1.write_bram here, since if we
 977             # are writing refill data here, then we don't have a
 978             # cache hit this cycle on the line being refilled.
 979             # (There is the possibility that the load following the
 980             # load miss that started the refill could be to the old
 981             # contents of the victim line, since it is a couple of
 982             # cycles after the refill starts before we see the updated
 983             # cache tag. In that case we don't use the bypass.)
 984             comb += use_forward1_next.eq(r1.write_bram)
 985         with m.If((r1.forward_row1 == req_row) & (r1.forward_way1 == hit_way)):
 986             comb += use_forward2_next.eq(r1.forward_valid1)
 987
 988         # The way that matched on a hit
 989         comb += req_hit_way.eq(hit_way)
 990
 991         # The way to replace on a miss
 992         with m.If(r1.write_tag):
 993             comb += replace_way.eq(plru_victim)
 994         with m.Else():
 995             comb += replace_way.eq(r1.store_way)
 996
 997         # work out whether we have permission for this access
 998         # NB we don't yet implement AMR, thus no KUAP
 999         comb += rc_ok.eq(perm_attr.reference
1000                          & (r0.req.load | perm_attr.changed))
1001         comb += perm_ok.eq((r0.req.priv_mode | (~perm_attr.priv)) &
1002                            (perm_attr.wr_perm |
1003                               (r0.req.load & perm_attr.rd_perm)))
1004         comb += access_ok.eq(valid_ra & perm_ok & rc_ok)
1005
1006         # Combine the request and cache hit status to decide what
1007         # operation needs to be done
1008         comb += nc.eq(r0.req.nc | perm_attr.nocache)
1009         comb += op.eq(Op.OP_NONE)
1010         with m.If(go):
1011             with m.If(~access_ok):
1012                 m.d.sync += Display("DCACHE access fail valid_ra=%d p=%d rc=%d",
1013                                  valid_ra, perm_ok, rc_ok)
1014                 comb += op.eq(Op.OP_BAD)
1015             with m.Elif(cancel_store):
1016                 m.d.sync += Display("DCACHE cancel store")
1017                 comb += op.eq(Op.OP_STCX_FAIL)
1018             with m.Else():
1019                 m.d.sync += Display("DCACHE valid_ra=%d nc=%d ld=%d",
1020                                  valid_ra, nc, r0.req.load)
1021                 comb += opsel.eq(Cat(is_hit, nc, r0.req.load))
1022                 with m.Switch(opsel):
1023                     with m.Case(0b101): comb += op.eq(Op.OP_LOAD_HIT)
1024                     with m.Case(0b100): comb += op.eq(Op.OP_LOAD_MISS)
1025                     with m.Case(0b110): comb += op.eq(Op.OP_LOAD_NC)
1026                     with m.Case(0b001): comb += op.eq(Op.OP_STORE_HIT)
1027                     with m.Case(0b000): comb += op.eq(Op.OP_STORE_MISS)
1028                     with m.Case(0b010): comb += op.eq(Op.OP_STORE_MISS)
1029                     with m.Case(0b011): comb += op.eq(Op.OP_BAD)
1030                     with m.Case(0b111): comb += op.eq(Op.OP_BAD)
1031         comb += req_op.eq(op)
1032         comb += req_go.eq(go)
1033
1034         # Version of the row number that is valid one cycle earlier
1035         # in the cases where we need to read the cache data BRAM.
1036         # If we're stalling then we need to keep reading the last
1037         # row requested.
1038         with m.If(~r0_stall):
1039             with m.If(m_in.valid):
1040                 comb += early_req_row.eq(get_row(m_in.addr))
1041             with m.Else():
1042                 comb += early_req_row.eq(get_row(d_in.addr))
1043         with m.Else():
1044             comb += early_req_row.eq(req_row)
1045
1046     def reservation_comb(self, m, cancel_store, set_rsrv, clear_rsrv,
1047                          r0_valid, r0, reservation):
1048         """Handle load-with-reservation and store-conditional instructions
1049         """
1050         comb = m.d.comb
1051
1052         with m.If(r0_valid & r0.req.reserve):
1053             # XXX generate alignment interrupt if address
1054             # is not aligned XXX or if r0.req.nc = '1'
1055             with m.If(r0.req.load):
1056                 comb += set_rsrv.eq(r0.req.atomic_last) # load with reservation
1057             with m.Else():
1058                 comb += clear_rsrv.eq(r0.req.atomic_last) # store conditional
1059                 with m.If((~reservation.valid) |
1060                          (r0.req.addr[LINE_OFF_BITS:64] != reservation.addr)):
1061                     comb += cancel_store.eq(1)
1062
1063     def reservation_reg(self, m, r0_valid, access_ok, set_rsrv, clear_rsrv,
1064                         reservation, r0):
1065         comb = m.d.comb
1066         sync = m.d.sync
1067
1068         with m.If(r0_valid & access_ok):
1069             with m.If(clear_rsrv):
1070                 sync += reservation.valid.eq(0)
1071             with m.Elif(set_rsrv):
1072                 sync += reservation.valid.eq(1)
1073                 sync += reservation.addr.eq(r0.req.addr[LINE_OFF_BITS:64])
1074
1075     def writeback_control(self, m, r1, cache_out_row):
1076         """Return data for loads & completion control logic
1077         """
1078         comb = m.d.comb
1079         sync = m.d.sync
1080         d_out, m_out = self.d_out, self.m_out
1081
1082         data_out = Signal(64)
1083         data_fwd = Signal(64)
1084
1085         # Use the bypass if are reading the row that was
1086         # written 1 or 2 cycles ago, including for the
1087         # slow_valid = 1 case (i.e. completing a load
1088         # miss or a non-cacheable load).
1089         with m.If(r1.use_forward1):
1090             comb += data_fwd.eq(r1.forward_data1)
1091         with m.Else():
1092             comb += data_fwd.eq(r1.forward_data2)
1093
1094         comb += data_out.eq(cache_out_row)
1095
1096         for i in range(8):
1097             with m.If(r1.forward_sel[i]):
1098                 dsel = data_fwd.word_select(i, 8)
1099                 comb += data_out.word_select(i, 8).eq(dsel)
1100
1101         # DCache output to LoadStore
1102         comb += d_out.valid.eq(r1.ls_valid)
1103         comb += d_out.data.eq(data_out)
1104         comb += d_out.store_done.eq(~r1.stcx_fail)
1105         comb += d_out.error.eq(r1.ls_error)
1106         comb += d_out.cache_paradox.eq(r1.cache_paradox)
1107
1108         # Outputs to MMU
1109         comb += m_out.done.eq(r1.mmu_done)
1110         comb += m_out.err.eq(r1.mmu_error)
1111         comb += m_out.data.eq(data_out)
1112
1113         # We have a valid load or store hit or we just completed
1114         # a slow op such as a load miss, a NC load or a store
1115         #
1116         # Note: the load hit is delayed by one cycle. However it
1117         # can still not collide with r.slow_valid (well unless I
1118         # miscalculated) because slow_valid can only be set on a
1119         # subsequent request and not on its first cycle (the state
1120         # machine must have advanced), which makes slow_valid
1121         # at least 2 cycles from the previous hit_load_valid.
1122
1123         # Sanity: Only one of these must be set in any given cycle
1124
1125         if False: # TODO: need Display to get this to work
1126             assert (r1.slow_valid & r1.stcx_fail) != 1, \
1127             "unexpected slow_valid collision with stcx_fail"
1128
1129             assert ((r1.slow_valid | r1.stcx_fail) | r1.hit_load_valid) != 1, \
1130              "unexpected hit_load_delayed collision with slow_valid"
1131
1132         with m.If(~r1.mmu_req):
1133             # Request came from loadstore1...
1134             # Load hit case is the standard path
1135             with m.If(r1.hit_load_valid):
1136                 sync += Display("completing load hit data=%x", data_out)
1137
1138             # error cases complete without stalling
1139             with m.If(r1.ls_error):
1140                 with m.If(r1.dcbz):
1141                     sync += Display("completing dcbz with error")
1142                 with m.Else():
1143                     sync += Display("completing ld/st with error")
1144
1145             # Slow ops (load miss, NC, stores)
1146             with m.If(r1.slow_valid):
1147                 sync += Display("completing store or load miss adr=%x data=%x",
1148                                 r1.req.real_addr, data_out)
1149
1150         with m.Else():
1151             # Request came from MMU
1152             with m.If(r1.hit_load_valid):
1153                 sync += Display("completing load hit to MMU, data=%x",
1154                                 m_out.data)
1155             # error cases complete without stalling
1156             with m.If(r1.mmu_error):
1157                 sync += Display("combpleting MMU ld with error")
1158
1159             # Slow ops (i.e. load miss)
1160             with m.If(r1.slow_valid):
1161                 sync += Display("completing MMU load miss, adr=%x data=%x",
1162                                 r1.req.real_addr, m_out.data)
1163
1164     def rams(self, m, r1, early_req_row, cache_out_row, replace_way):
1165         """rams
1166         Generate a cache RAM for each way. This handles the normal
1167         reads, writes from reloads and the special store-hit update
1168         path as well.
1169
1170         Note: the BRAMs have an extra read buffer, meaning the output
1171         is pipelined an extra cycle. This differs from the
1172         icache. The writeback logic needs to take that into
1173         account by using 1-cycle delayed signals for load hits.
1174         """
1175         comb = m.d.comb
1176         bus = self.bus
1177
1178         # a Binary-to-Unary one-hots here.  replace-way one-hot is gated
1179         # (enabled) by bus.ack, not-write-bram, and state RELOAD_WAIT_ACK
1180         m.submodules.rams_replace_way_e = rwe = Decoder(NUM_WAYS)
1181         comb += rwe.n.eq(~((r1.state == State.RELOAD_WAIT_ACK) & bus.ack &
1182                    ~r1.write_bram))
1183         comb += rwe.i.eq(replace_way)
1184
1185         m.submodules.rams_hit_way_e = hwe = Decoder(NUM_WAYS)
1186         comb += hwe.i.eq(r1.hit_way)
1187
1188         # this one is gated with write_bram, and replace_way_e can never be
1189         # set at the same time.  that means that do_write can OR the outputs
1190         m.submodules.rams_hit_req_way_e = hre = Decoder(NUM_WAYS)
1191         comb += hre.n.eq(~r1.write_bram) # Decoder.n is inverted
1192         comb += hre.i.eq(r1.req.hit_way)
1193
1194         # common Signals
1195         do_read  = Signal()
1196         wr_addr  = Signal(ROW_BITS)
1197         wr_data  = Signal(WB_DATA_BITS)
1198         wr_sel   = Signal(ROW_SIZE)
1199         rd_addr  = Signal(ROW_BITS)
1200
1201         comb += do_read.eq(1) # always enable
1202         comb += rd_addr.eq(early_req_row)
1203
1204         # Write mux:
1205         #
1206         # Defaults to wishbone read responses (cache refill)
1207         #
1208         # For timing, the mux on wr_data/sel/addr is not
1209         # dependent on anything other than the current state.
1210
1211         with m.If(r1.write_bram):
1212             # Write store data to BRAM.  This happens one
1213             # cycle after the store is in r0.
1214             comb += wr_data.eq(r1.req.data)
1215             comb += wr_sel.eq(r1.req.byte_sel)
1216             comb += wr_addr.eq(get_row(r1.req.real_addr))
1217
1218         with m.Else():
1219             # Otherwise, we might be doing a reload or a DCBZ
1220             with m.If(r1.dcbz):
1221                 comb += wr_data.eq(0)
1222             with m.Else():
1223                 comb += wr_data.eq(bus.dat_r)
1224             comb += wr_addr.eq(r1.store_row)
1225             comb += wr_sel.eq(~0) # all 1s
1226
1227         # set up Cache Rams
1228         for i in range(NUM_WAYS):
1229             do_write = Signal(name="do_wr%d" % i)
1230             wr_sel_m = Signal(ROW_SIZE, name="wr_sel_m_%d" % i)
1231             d_out   = Signal(WB_DATA_BITS, name="dout_%d" % i) # cache_row_t
1232
1233             way = CacheRam(ROW_BITS, WB_DATA_BITS, ADD_BUF=True, ram_num=i)
1234             setattr(m.submodules, "cacheram_%d" % i, way)
1235
1236             comb += way.rd_en.eq(do_read)
1237             comb += way.rd_addr.eq(rd_addr)
1238             comb += d_out.eq(way.rd_data_o)
1239             comb += way.wr_sel.eq(wr_sel_m)
1240             comb += way.wr_addr.eq(wr_addr)
1241             comb += way.wr_data.eq(wr_data)
1242
1243             # Cache hit reads
1244             with m.If(hwe.o[i]):
1245                 comb += cache_out_row.eq(d_out)
1246
1247             # these are mutually-exclusive via their Decoder-enablers
1248             # (note: Decoder-enable is inverted)
1249             comb += do_write.eq(hre.o[i] | rwe.o[i])
1250
1251             # Mask write selects with do_write since BRAM
1252             # doesn't have a global write-enable
1253             with m.If(do_write):
1254                 comb += wr_sel_m.eq(wr_sel)
1255
1256     # Cache hit synchronous machine for the easy case.
1257     # This handles load hits.
1258     # It also handles error cases (TLB miss, cache paradox)
1259     def dcache_fast_hit(self, m, req_op, r0_valid, r0, r1,
1260                         req_hit_way, req_index, req_tag, access_ok,
1261                         tlb_hit, tlb_req_index):
1262         comb = m.d.comb
1263         sync = m.d.sync
1264
1265         with m.If(req_op != Op.OP_NONE):
1266             sync += Display("op:%d addr:%x nc: %d idx: %x tag: %x way: %x",
1267                     req_op, r0.req.addr, r0.req.nc,
1268                     req_index, req_tag, req_hit_way)
1269
1270         with m.If(r0_valid):
1271             sync += r1.mmu_req.eq(r0.mmu_req)
1272
1273         # Fast path for load/store hits.
1274         # Set signals for the writeback controls.
1275         sync += r1.hit_way.eq(req_hit_way)
1276         sync += r1.hit_index.eq(req_index)
1277
1278         sync += r1.hit_load_valid.eq(req_op == Op.OP_LOAD_HIT)
1279         sync += r1.cache_hit.eq((req_op == Op.OP_LOAD_HIT) |
1280                                 (req_op == Op.OP_STORE_HIT))
1281
1282         with m.If(req_op == Op.OP_BAD):
1283             sync += Display("Signalling ld/st error "
1284                             "ls_error=%i mmu_error=%i cache_paradox=%i",
1285                             ~r0.mmu_req,r0.mmu_req,access_ok)
1286             sync += r1.ls_error.eq(~r0.mmu_req)
1287             sync += r1.mmu_error.eq(r0.mmu_req)
1288             sync += r1.cache_paradox.eq(access_ok)
1289         with m.Else():
1290             sync += r1.ls_error.eq(0)
1291             sync += r1.mmu_error.eq(0)
1292             sync += r1.cache_paradox.eq(0)
1293
1294         sync += r1.stcx_fail.eq(req_op == Op.OP_STCX_FAIL)
1295
1296         # Record TLB hit information for updating TLB PLRU
1297         sync += r1.tlb_hit.eq(tlb_hit)
1298         sync += r1.tlb_hit_index.eq(tlb_req_index)
1299
1300     # Memory accesses are handled by this state machine:
1301     #
1302     #   * Cache load miss/reload (in conjunction with "rams")
1303     #   * Load hits for non-cachable forms
1304     #   * Stores (the collision case is handled in "rams")
1305     #
1306     # All wishbone requests generation is done here.
1307     # This machine operates at stage 1.
1308     def dcache_slow(self, m, r1, use_forward1_next, use_forward2_next,
1309                     r0, replace_way,
1310                     req_hit_way, req_same_tag,
1311                     r0_valid, req_op, cache_tags, req_go, ra):
1312
1313         comb = m.d.comb
1314         sync = m.d.sync
1315         bus = self.bus
1316         d_in = self.d_in
1317
1318         req         = MemAccessRequest("mreq_ds")
1319
1320         req_row = Signal(ROW_BITS)
1321         req_idx = Signal(INDEX_BITS)
1322         req_tag = Signal(TAG_BITS)
1323         comb += req_idx.eq(get_index(req.real_addr))
1324         comb += req_row.eq(get_row(req.real_addr))
1325         comb += req_tag.eq(get_tag(req.real_addr))
1326
1327         sync += r1.use_forward1.eq(use_forward1_next)
1328         sync += r1.forward_sel.eq(0)
1329
1330         with m.If(use_forward1_next):
1331             sync += r1.forward_sel.eq(r1.req.byte_sel)
1332         with m.Elif(use_forward2_next):
1333             sync += r1.forward_sel.eq(r1.forward_sel1)
1334
1335         sync += r1.forward_data2.eq(r1.forward_data1)
1336         with m.If(r1.write_bram):
1337             sync += r1.forward_data1.eq(r1.req.data)
1338             sync += r1.forward_sel1.eq(r1.req.byte_sel)
1339             sync += r1.forward_way1.eq(r1.req.hit_way)
1340             sync += r1.forward_row1.eq(get_row(r1.req.real_addr))
1341             sync += r1.forward_valid1.eq(1)
1342         with m.Else():
1343             with m.If(r1.dcbz):
1344                 sync += r1.forward_data1.eq(0)
1345             with m.Else():
1346                 sync += r1.forward_data1.eq(bus.dat_r)
1347             sync += r1.forward_sel1.eq(~0) # all 1s
1348             sync += r1.forward_way1.eq(replace_way)
1349             sync += r1.forward_row1.eq(r1.store_row)
1350             sync += r1.forward_valid1.eq(0)
1351
1352         # One cycle pulses reset
1353         sync += r1.slow_valid.eq(0)
1354         sync += r1.write_bram.eq(0)
1355         sync += r1.inc_acks.eq(0)
1356         sync += r1.dec_acks.eq(0)
1357
1358         sync += r1.ls_valid.eq(0)
1359         # complete tlbies and TLB loads in the third cycle
1360         sync += r1.mmu_done.eq(r0_valid & (r0.tlbie | r0.tlbld))
1361
1362         with m.If((req_op == Op.OP_LOAD_HIT) | (req_op == Op.OP_STCX_FAIL)):
1363             with m.If(~r0.mmu_req):
1364                 sync += r1.ls_valid.eq(1)
1365             with m.Else():
1366                 sync += r1.mmu_done.eq(1)
1367
1368         with m.If(r1.write_tag):
1369             # Store new tag in selected way
1370             replace_way_onehot = Signal(NUM_WAYS)
1371             comb += replace_way_onehot.eq(1<<replace_way)
1372             for i in range(NUM_WAYS):
1373                 with m.If(replace_way_onehot[i]):
1374                     ct = Signal(TAG_RAM_WIDTH)
1375                     comb += ct.eq(cache_tags[r1.store_index].tag)
1376                     comb += ct.word_select(i, TAG_WIDTH).eq(r1.reload_tag)
1377                     sync += cache_tags[r1.store_index].tag.eq(ct)
1378             sync += r1.store_way.eq(replace_way)
1379             sync += r1.write_tag.eq(0)
1380
1381         # Take request from r1.req if there is one there,
1382         # else from req_op, ra, etc.
1383         with m.If(r1.full):
1384             comb += req.eq(r1.req)
1385         with m.Else():
1386             comb += req.op.eq(req_op)
1387             comb += req.valid.eq(req_go)
1388             comb += req.mmu_req.eq(r0.mmu_req)
1389             comb += req.dcbz.eq(r0.req.dcbz)
1390             comb += req.real_addr.eq(ra)
1391
1392             with m.If(r0.req.dcbz):
1393                 # force data to 0 for dcbz
1394                 comb += req.data.eq(0)
1395             with m.Elif(r0.d_valid):
1396                 comb += req.data.eq(r0.req.data)
1397             with m.Else():
1398                 comb += req.data.eq(d_in.data)
1399
1400             # Select all bytes for dcbz
1401             # and for cacheable loads
1402             with m.If(r0.req.dcbz | (r0.req.load & ~r0.req.nc)):
1403                 comb += req.byte_sel.eq(~0) # all 1s
1404             with m.Else():
1405                 comb += req.byte_sel.eq(r0.req.byte_sel)
1406             comb += req.hit_way.eq(req_hit_way)
1407             comb += req.same_tag.eq(req_same_tag)
1408
1409             # Store the incoming request from r0,
1410             # if it is a slow request
1411             # Note that r1.full = 1 implies req_op = OP_NONE
1412             with m.If((req_op == Op.OP_LOAD_MISS)
1413                       | (req_op == Op.OP_LOAD_NC)
1414                       | (req_op == Op.OP_STORE_MISS)
1415                       | (req_op == Op.OP_STORE_HIT)):
1416                 sync += r1.req.eq(req)
1417                 sync += r1.full.eq(1)
1418
1419         # Main state machine
1420         with m.Switch(r1.state):
1421
1422             with m.Case(State.IDLE):
1423                 sync += r1.wb.adr.eq(req.real_addr[ROW_LINE_BITS:])
1424                 sync += r1.wb.sel.eq(req.byte_sel)
1425                 sync += r1.wb.dat.eq(req.data)
1426                 sync += r1.dcbz.eq(req.dcbz)
1427
1428                 # Keep track of our index and way
1429                 # for subsequent stores.
1430                 sync += r1.store_index.eq(req_idx)
1431                 sync += r1.store_row.eq(req_row)
1432                 sync += r1.end_row_ix.eq(get_row_of_line(req_row)-1)
1433                 sync += r1.reload_tag.eq(req_tag)
1434                 sync += r1.req.same_tag.eq(1)
1435
1436                 with m.If(req.op == Op.OP_STORE_HIT):
1437                     sync += r1.store_way.eq(req.hit_way)
1438
1439                 # Reset per-row valid bits,
1440                 # ready for handling OP_LOAD_MISS
1441                 for i in range(ROW_PER_LINE):
1442                     sync += r1.rows_valid[i].eq(0)
1443
1444                 with m.If(req_op != Op.OP_NONE):
1445                     sync += Display("cache op %d", req.op)
1446
1447                 with m.Switch(req.op):
1448                     with m.Case(Op.OP_LOAD_HIT):
1449                         # stay in IDLE state
1450                         pass
1451
1452                     with m.Case(Op.OP_LOAD_MISS):
1453                         sync += Display("cache miss real addr: %x " \
1454                                 "idx: %x tag: %x",
1455                                 req.real_addr, req_row, req_tag)
1456
1457                         # Start the wishbone cycle
1458                         sync += r1.wb.we.eq(0)
1459                         sync += r1.wb.cyc.eq(1)
1460                         sync += r1.wb.stb.eq(1)
1461
1462                         # Track that we had one request sent
1463                         sync += r1.state.eq(State.RELOAD_WAIT_ACK)
1464                         sync += r1.write_tag.eq(1)
1465
1466                     with m.Case(Op.OP_LOAD_NC):
1467                         sync += r1.wb.cyc.eq(1)
1468                         sync += r1.wb.stb.eq(1)
1469                         sync += r1.wb.we.eq(0)
1470                         sync += r1.state.eq(State.NC_LOAD_WAIT_ACK)
1471
1472                     with m.Case(Op.OP_STORE_HIT, Op.OP_STORE_MISS):
1473                         with m.If(~req.dcbz):
1474                             sync += r1.state.eq(State.STORE_WAIT_ACK)
1475                             sync += r1.acks_pending.eq(1)
1476                             sync += r1.full.eq(0)
1477                             sync += r1.slow_valid.eq(1)
1478
1479                             with m.If(~req.mmu_req):
1480                                 sync += r1.ls_valid.eq(1)
1481                             with m.Else():
1482                                 sync += r1.mmu_done.eq(1)
1483
1484                             with m.If(req.op == Op.OP_STORE_HIT):
1485                                 sync += r1.write_bram.eq(1)
1486                         with m.Else():
1487                             # dcbz is handled much like a load miss except
1488                             # that we are writing to memory instead of reading
1489                             sync += r1.state.eq(State.RELOAD_WAIT_ACK)
1490
1491                             with m.If(req.op == Op.OP_STORE_MISS):
1492                                 sync += r1.write_tag.eq(1)
1493
1494                         sync += r1.wb.we.eq(1)
1495                         sync += r1.wb.cyc.eq(1)
1496                         sync += r1.wb.stb.eq(1)
1497
1498                     # OP_NONE and OP_BAD do nothing
1499                     # OP_BAD & OP_STCX_FAIL were
1500                     # handled above already
1501                     with m.Case(Op.OP_NONE):
1502                         pass
1503                     with m.Case(Op.OP_BAD):
1504                         pass
1505                     with m.Case(Op.OP_STCX_FAIL):
1506                         pass
1507
1508             with m.Case(State.RELOAD_WAIT_ACK):
1509                 ld_stbs_done = Signal()
1510                 # Requests are all sent if stb is 0
1511                 comb += ld_stbs_done.eq(~r1.wb.stb)
1512
1513                 # If we are still sending requests, was one accepted?
1514                 with m.If((~bus.stall) & r1.wb.stb):
1515                     # That was the last word?  We are done sending.
1516                     # Clear stb and set ld_stbs_done so we can handle an
1517                     # eventual last ack on the same cycle.
1518                     # sigh - reconstruct wb adr with 3 extra 0s at front
1519                     wb_adr = Cat(Const(0, ROW_OFF_BITS), r1.wb.adr)
1520                     with m.If(is_last_row_addr(wb_adr, r1.end_row_ix)):
1521                         sync += r1.wb.stb.eq(0)
1522                         comb += ld_stbs_done.eq(1)
1523
1524                     # Calculate the next row address in the current cache line
1525                     row = Signal(LINE_OFF_BITS-ROW_OFF_BITS)
1526                     comb += row.eq(r1.wb.adr)
1527                     sync += r1.wb.adr[:LINE_OFF_BITS-ROW_OFF_BITS].eq(row+1)
1528
1529                 # Incoming acks processing
1530                 sync += r1.forward_valid1.eq(bus.ack)
1531                 with m.If(bus.ack):
1532                     srow = Signal(ROW_LINE_BITS)
1533                     comb += srow.eq(r1.store_row)
1534                     sync += r1.rows_valid[srow].eq(1)
1535
1536                     # If this is the data we were looking for,
1537                     # we can complete the request next cycle.
1538                     # Compare the whole address in case the
1539                     # request in r1.req is not the one that
1540                     # started this refill.
1541                     with m.If(req.valid & r1.req.same_tag &
1542                               ((r1.dcbz & r1.req.dcbz) |
1543                                (~r1.dcbz & (r1.req.op == Op.OP_LOAD_MISS))) &
1544                                 (r1.store_row == get_row(req.real_addr))):
1545                         sync += r1.full.eq(0)
1546                         sync += r1.slow_valid.eq(1)
1547                         with m.If(~r1.mmu_req):
1548                             sync += r1.ls_valid.eq(1)
1549                         with m.Else():
1550                             sync += r1.mmu_done.eq(1)
1551                         sync += r1.forward_sel.eq(~0) # all 1s
1552                         sync += r1.use_forward1.eq(1)
1553
1554                     # Check for completion
1555                     with m.If(ld_stbs_done & is_last_row(r1.store_row,
1556                                                       r1.end_row_ix)):
1557                         # Complete wishbone cycle
1558                         sync += r1.wb.cyc.eq(0)
1559
1560                         # Cache line is now valid
1561                         cv = Signal(INDEX_BITS)
1562                         comb += cv.eq(cache_tags[r1.store_index].valid)
1563                         comb += cv.bit_select(r1.store_way, 1).eq(1)
1564                         sync += cache_tags[r1.store_index].valid.eq(cv)
1565
1566                         sync += r1.state.eq(State.IDLE)
1567                         sync += Display("cache valid set %x "
1568                                         "idx %d way %d",
1569                                          cv, r1.store_index, r1.store_way)
1570
1571                     # Increment store row counter
1572                     sync += r1.store_row.eq(next_row(r1.store_row))
1573
1574             with m.Case(State.STORE_WAIT_ACK):
1575                 st_stbs_done = Signal()
1576                 acks        = Signal(3)
1577                 adjust_acks = Signal(3)
1578
1579                 comb += st_stbs_done.eq(~r1.wb.stb)
1580                 comb += acks.eq(r1.acks_pending)
1581
1582                 with m.If(r1.inc_acks != r1.dec_acks):
1583                     with m.If(r1.inc_acks):
1584                         comb += adjust_acks.eq(acks + 1)
1585                     with m.Else():
1586                         comb += adjust_acks.eq(acks - 1)
1587                 with m.Else():
1588                     comb += adjust_acks.eq(acks)
1589
1590                 sync += r1.acks_pending.eq(adjust_acks)
1591
1592                 # Clear stb when slave accepted request
1593                 with m.If(~bus.stall):
1594                     # See if there is another store waiting
1595                     # to be done which is in the same real page.
1596                     with m.If(req.valid):
1597                         _ra = req.real_addr[ROW_LINE_BITS:SET_SIZE_BITS]
1598                         sync += r1.wb.adr[0:SET_SIZE_BITS].eq(_ra)
1599                         sync += r1.wb.dat.eq(req.data)
1600                         sync += r1.wb.sel.eq(req.byte_sel)
1601
1602                     with m.If((adjust_acks < 7) & req.same_tag &
1603                                 ((req.op == Op.OP_STORE_MISS)
1604                                  | (req.op == Op.OP_STORE_HIT))):
1605                         sync += r1.wb.stb.eq(1)
1606                         comb += st_stbs_done.eq(0)
1607
1608                         with m.If(req.op == Op.OP_STORE_HIT):
1609                             sync += r1.write_bram.eq(1)
1610                         sync += r1.full.eq(0)
1611                         sync += r1.slow_valid.eq(1)
1612
1613                         # Store requests never come from the MMU
1614                         sync += r1.ls_valid.eq(1)
1615                         comb += st_stbs_done.eq(0)
1616                         sync += r1.inc_acks.eq(1)
1617                     with m.Else():
1618                         sync += r1.wb.stb.eq(0)
1619                         comb += st_stbs_done.eq(1)
1620
1621                 # Got ack ? See if complete.
1622                 with m.If(bus.ack):
1623                     with m.If(st_stbs_done & (adjust_acks == 1)):
1624                         sync += r1.state.eq(State.IDLE)
1625                         sync += r1.wb.cyc.eq(0)
1626                         sync += r1.wb.stb.eq(0)
1627                     sync += r1.dec_acks.eq(1)
1628
1629             with m.Case(State.NC_LOAD_WAIT_ACK):
1630                 # Clear stb when slave accepted request
1631                 with m.If(~bus.stall):
1632                     sync += r1.wb.stb.eq(0)
1633
1634                 # Got ack ? complete.
1635                 with m.If(bus.ack):
1636                     sync += r1.state.eq(State.IDLE)
1637                     sync += r1.full.eq(0)
1638                     sync += r1.slow_valid.eq(1)
1639
1640                     with m.If(~r1.mmu_req):
1641                         sync += r1.ls_valid.eq(1)
1642                     with m.Else():
1643                         sync += r1.mmu_done.eq(1)
1644
1645                     sync += r1.forward_sel.eq(~0) # all 1s
1646                     sync += r1.use_forward1.eq(1)
1647                     sync += r1.wb.cyc.eq(0)
1648                     sync += r1.wb.stb.eq(0)
1649
1650     def dcache_log(self, m, r1, valid_ra, tlb_hit, stall_out):
1651
1652         sync = m.d.sync
1653         d_out, bus, log_out = self.d_out, self.bus, self.log_out
1654
1655         sync += log_out.eq(Cat(r1.state[:3], valid_ra, tlb_hit.way[:3],
1656                                stall_out, req_op[:3], d_out.valid, d_out.error,
1657                                r1.wb.cyc, r1.wb.stb, bus.ack, bus.stall,
1658                                r1.real_adr[3:6]))
1659
1660     def elaborate(self, platform):
1661
1662         m = Module()
1663         comb = m.d.comb
1664         d_in = self.d_in
1665
1666         # Storage. Hopefully "cache_rows" is a BRAM, the rest is LUTs
1667         cache_tags       = CacheTagArray()
1668         cache_tag_set    = Signal(TAG_RAM_WIDTH)
1669
1670         # TODO attribute ram_style : string;
1671         # TODO attribute ram_style of cache_tags : signal is "distributed";
1672
1673         """note: these are passed to nmigen.hdl.Memory as "attributes".
1674            don't know how, just that they are.
1675         """
1676         # TODO attribute ram_style of
1677         #  dtlb_tags : signal is "distributed";
1678         # TODO attribute ram_style of
1679         #  dtlb_ptes : signal is "distributed";
1680
1681         r0      = RegStage0("r0")
1682         r0_full = Signal()
1683
1684         r1 = RegStage1("r1")
1685
1686         reservation = Reservation()
1687
1688         # Async signals on incoming request
1689         req_index    = Signal(INDEX_BITS)
1690         req_row      = Signal(ROW_BITS)
1691         req_hit_way  = Signal(WAY_BITS)
1692         req_tag      = Signal(TAG_BITS)
1693         req_op       = Signal(Op)
1694         req_data     = Signal(64)
1695         req_same_tag = Signal()
1696         req_go       = Signal()
1697
1698         early_req_row     = Signal(ROW_BITS)
1699
1700         cancel_store      = Signal()
1701         set_rsrv          = Signal()
1702         clear_rsrv        = Signal()
1703
1704         r0_valid          = Signal()
1705         r0_stall          = Signal()
1706
1707         use_forward1_next = Signal()
1708         use_forward2_next = Signal()
1709
1710         cache_out_row     = Signal(WB_DATA_BITS)
1711
1712         plru_victim       = Signal(WAY_BITS)
1713         replace_way       = Signal(WAY_BITS)
1714
1715         # Wishbone read/write/cache write formatting signals
1716         bus_sel           = Signal(8)
1717
1718         # TLB signals
1719         tlb_way       = TLBRecord("tlb_way")
1720         tlb_req_index = Signal(TLB_SET_BITS)
1721         tlb_hit       = TLBHit("tlb_hit")
1722         pte           = Signal(TLB_PTE_BITS)
1723         ra            = Signal(REAL_ADDR_BITS)
1724         valid_ra      = Signal()
1725         perm_attr     = PermAttr("dc_perms")
1726         rc_ok         = Signal()
1727         perm_ok       = Signal()
1728         access_ok     = Signal()
1729
1730         tlb_plru_victim = Signal(TLB_WAY_BITS)
1731
1732         # we don't yet handle collisions between loadstore1 requests
1733         # and MMU requests
1734         comb += self.m_out.stall.eq(0)
1735
1736         # Hold off the request in r0 when r1 has an uncompleted request
1737         comb += r0_stall.eq(r0_full & (r1.full | d_in.hold))
1738         comb += r0_valid.eq(r0_full & ~r1.full & ~d_in.hold)
1739         comb += self.stall_out.eq(r0_stall)
1740
1741         # deal with litex not doing wishbone pipeline mode
1742         # XXX in wrong way.  FIFOs are needed in the SRAM test
1743         # so that stb/ack match up. same thing done in icache.py
1744         comb += self.bus.stall.eq(self.bus.cyc & ~self.bus.ack)
1745
1746         # Wire up wishbone request latch out of stage 1
1747         comb += self.bus.we.eq(r1.wb.we)
1748         comb += self.bus.adr.eq(r1.wb.adr)
1749         comb += self.bus.sel.eq(r1.wb.sel)
1750         comb += self.bus.stb.eq(r1.wb.stb)
1751         comb += self.bus.dat_w.eq(r1.wb.dat)
1752         comb += self.bus.cyc.eq(r1.wb.cyc)
1753
1754         # create submodule TLBUpdate
1755         m.submodules.dtlb_update = self.dtlb_update = DTLBUpdate()
1756
1757         # call sub-functions putting everything together, using shared
1758         # signals established above
1759         self.stage_0(m, r0, r1, r0_full)
1760         self.tlb_read(m, r0_stall, tlb_way)
1761         self.tlb_search(m, tlb_req_index, r0, r0_valid,
1762                         tlb_way,
1763                         pte, tlb_hit, valid_ra, perm_attr, ra)
1764         self.tlb_update(m, r0_valid, r0, tlb_req_index,
1765                         tlb_hit, tlb_plru_victim)
1766         self.maybe_plrus(m, r1, plru_victim)
1767         self.maybe_tlb_plrus(m, r1, tlb_plru_victim, tlb_req_index)
1768         self.cache_tag_read(m, r0_stall, req_index, cache_tag_set, cache_tags)
1769         self.dcache_request(m, r0, ra, req_index, req_row, req_tag,
1770                            r0_valid, r1, cache_tags, replace_way,
1771                            use_forward1_next, use_forward2_next,
1772                            req_hit_way, plru_victim, rc_ok, perm_attr,
1773                            valid_ra, perm_ok, access_ok, req_op, req_go,
1774                            tlb_hit, tlb_way, cache_tag_set,
1775                            cancel_store, req_same_tag, r0_stall, early_req_row)
1776         self.reservation_comb(m, cancel_store, set_rsrv, clear_rsrv,
1777                            r0_valid, r0, reservation)
1778         self.reservation_reg(m, r0_valid, access_ok, set_rsrv, clear_rsrv,
1779                            reservation, r0)
1780         self.writeback_control(m, r1, cache_out_row)
1781         self.rams(m, r1, early_req_row, cache_out_row, replace_way)
1782         self.dcache_fast_hit(m, req_op, r0_valid, r0, r1,
1783                         req_hit_way, req_index, req_tag, access_ok,
1784                         tlb_hit, tlb_req_index)
1785         self.dcache_slow(m, r1, use_forward1_next, use_forward2_next,
1786                     r0, replace_way,
1787                     req_hit_way, req_same_tag,
1788                          r0_valid, req_op, cache_tags, req_go, ra)
1789         #self.dcache_log(m, r1, valid_ra, tlb_hit, stall_out)
1790
1791         return m
1792
1793
1794 if __name__ == '__main__':
1795     dut = DCache()
1796     vl = rtlil.convert(dut, ports=[])
1797     with open("test_dcache.il", "w") as f:
1798         f.write(vl)