"""DCache

based on Anton Blanchard microwatt dcache.vhdl

note that the microwatt dcache wishbone interface expects "stall".
for simplicity at the moment this is hard-coded to cyc & ~ack.
see WB4 spec, p84, section 5.2.1

IMPORTANT: for store, the data is sampled the cycle AFTER the "valid"
is raised.  sigh

Links:

* https://libre-soc.org/3d_gpu/architecture/set_associative_cache.jpg
* https://bugs.libre-soc.org/show_bug.cgi?id=469

"""

import sys

from nmutil.gtkw import write_gtkw

sys.setrecursionlimit(1000000)

from enum import Enum, unique

from nmigen import (Module, Signal, Elaboratable, Cat, Repl, Array, Const,
                    Record, Memory)
from nmutil.util import Display
from nmigen.lib.coding import Decoder

from copy import deepcopy
from random import randint, seed

from nmigen_soc.wishbone.bus import Interface

from nmigen.cli import main
from nmutil.iocontrol import RecordObject
from nmigen.utils import log2_int
from soc.experiment.mem_types import (LoadStore1ToDCacheType,
                                      DCacheToLoadStore1Type,
                                      MMUToDCacheType,
                                      DCacheToMMUType)

from soc.experiment.wb_types import (WB_ADDR_BITS, WB_DATA_BITS, WB_SEL_BITS,
                                     WBAddrType, WBDataType, WBSelType,
                                     WBMasterOut, WBSlaveOut,
                                     WBMasterOutVector, WBSlaveOutVector,
                                     WBIOMasterOut, WBIOSlaveOut)

from soc.experiment.cache_ram import CacheRam
#from soc.experiment.plru import PLRU
from nmutil.plru import PLRU, PLRUs

# for test
from soc.bus.sram import SRAM
from nmigen.cli import rtlil

# NOTE: to use cxxsim, export NMIGEN_SIM_MODE=cxxsim from the shell
# Also, check out the cxxsim nmigen branch, and latest yosys from git
from nmutil.sim_tmp_alternative import Simulator

from nmutil.util import wrap


# TODO: make these parameters of DCache at some point
LINE_SIZE = 64    # Line size in bytes
NUM_LINES = 16    # Number of lines in a set
NUM_WAYS = 4      # Number of ways
TLB_SET_SIZE = 64 # L1 DTLB entries per set
TLB_NUM_WAYS = 2  # L1 DTLB number of ways
TLB_LG_PGSZ = 12  # L1 DTLB log_2(page_size)
LOG_LENGTH = 0    # Non-zero to enable log data collection

# BRAM organisation: We never access more than WB_DATA_BITS
# at a time so to save resources we make the array only that
# wide, and use consecutive indices to make a cache "line"
#
# ROW_SIZE is the width in bytes of the BRAM (based on WB, so 64-bits)
ROW_SIZE = WB_DATA_BITS // 8

# ROW_PER_LINE is the number of rows (wishbone
# transactions) in a line
ROW_PER_LINE = LINE_SIZE // ROW_SIZE

# BRAM_ROWS is the number of rows in BRAM needed
# to represent the full dcache
BRAM_ROWS = NUM_LINES * ROW_PER_LINE

print ("ROW_SIZE", ROW_SIZE)
print ("ROW_PER_LINE", ROW_PER_LINE)
print ("BRAM_ROWS", BRAM_ROWS)
print ("NUM_WAYS", NUM_WAYS)
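
# Worked example of the BRAM geometry above (a sketch: these figures
# hold only for the default parameters WB_DATA_BITS=64, LINE_SIZE=64,
# NUM_LINES=16, and are checked here purely as illustration)
assert ROW_SIZE == 8      # 64-bit (8-byte) wishbone data bus
assert ROW_PER_LINE == 8  # 64-byte line / 8-byte row
assert BRAM_ROWS == 128   # 16 lines x 8 rows per line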

# Bit field counts in the address

# REAL_ADDR_BITS is the number of real address
# bits that we store
REAL_ADDR_BITS = 56

# ROW_BITS is the number of bits to select a row
ROW_BITS = log2_int(BRAM_ROWS)

# ROW_LINE_BITS is the number of bits to select
# a row within a line
ROW_LINE_BITS = log2_int(ROW_PER_LINE)

# LINE_OFF_BITS is the number of bits for
# the offset in a cache line
LINE_OFF_BITS = log2_int(LINE_SIZE)

# ROW_OFF_BITS is the number of bits for
# the offset in a row
ROW_OFF_BITS = log2_int(ROW_SIZE)

# INDEX_BITS is the number of bits to
# select a cache line
INDEX_BITS = log2_int(NUM_LINES)

# SET_SIZE_BITS is the log base 2 of the set size
SET_SIZE_BITS = LINE_OFF_BITS + INDEX_BITS

# TAG_BITS is the number of bits of
# the tag part of the address
TAG_BITS = REAL_ADDR_BITS - SET_SIZE_BITS

# TAG_WIDTH is the width in bits of each way of the tag RAM
TAG_WIDTH = TAG_BITS + 7 - ((TAG_BITS + 7) % 8)

# WAY_BITS is the number of bits to select a way
WAY_BITS = log2_int(NUM_WAYS)

# Example of layout for 32 lines of 64 bytes:
layout = """\
  ..  tag    |index|  line  |
  ..         |   row   |    |
  ..         |     |---|    | ROW_LINE_BITS  (3)
  ..         |     |--- - --| LINE_OFF_BITS (6)
  ..         |         |- --| ROW_OFF_BITS  (3)
  ..         |----- ---|    | ROW_BITS      (8)
  ..         |-----|        | INDEX_BITS    (5)
  .. --------|              | TAG_BITS      (45)
"""
print (layout)
print ("Dcache TAG %d IDX %d ROW_BITS %d ROFF %d LOFF %d RLB %d" % \
            (TAG_BITS, INDEX_BITS, ROW_BITS,
             ROW_OFF_BITS, LINE_OFF_BITS, ROW_LINE_BITS))
print ("index @: %d-%d" % (LINE_OFF_BITS, SET_SIZE_BITS))
print ("row @: %d-%d" % (LINE_OFF_BITS, ROW_OFF_BITS))
print ("tag @: %d-%d width %d" % (SET_SIZE_BITS, REAL_ADDR_BITS, TAG_WIDTH))

TAG_RAM_WIDTH = TAG_WIDTH * NUM_WAYS

print ("TAG_RAM_WIDTH", TAG_RAM_WIDTH)
print ("    TAG_WIDTH", TAG_WIDTH)
print ("     NUM_WAYS", NUM_WAYS)
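
# Worked example of the address bit-field arithmetic (illustrative
# asserts, valid only for the default geometry above)
assert ROW_BITS == 7 and ROW_LINE_BITS == 3      # 128 rows, 8 per line
assert LINE_OFF_BITS == 6 and ROW_OFF_BITS == 3  # 64-byte line, 8-byte row
assert INDEX_BITS == 4 and SET_SIZE_BITS == 10   # 16 lines, 1KiB per way
assert TAG_BITS == 46 and TAG_WIDTH == 48        # 56-10, rounded to bytes
assert WAY_BITS == 2 and TAG_RAM_WIDTH == 192    # 4 ways, 48 tag bits each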

def CacheTagArray():
    tag_layout = [('valid', NUM_WAYS),
                  ('tag', TAG_RAM_WIDTH),
                 ]
    return Array(Record(tag_layout, name="tag%d" % x) for x in range(NUM_LINES))

def RowPerLineValidArray():
    return Array(Signal(name="rows_valid%d" % x) \
                        for x in range(ROW_PER_LINE))

# L1 TLB
TLB_SET_BITS = log2_int(TLB_SET_SIZE)
TLB_WAY_BITS = log2_int(TLB_NUM_WAYS)
TLB_EA_TAG_BITS = 64 - (TLB_LG_PGSZ + TLB_SET_BITS)
TLB_TAG_WAY_BITS = TLB_NUM_WAYS * TLB_EA_TAG_BITS
TLB_PTE_BITS = 64
TLB_PTE_WAY_BITS = TLB_NUM_WAYS * TLB_PTE_BITS

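# Worked example of the L1 TLB geometry (illustrative asserts for the
# default TLB_SET_SIZE=64, TLB_NUM_WAYS=2, TLB_LG_PGSZ=12)
assert TLB_SET_BITS == 6 and TLB_WAY_BITS == 1
assert TLB_EA_TAG_BITS == 46    # 64 - (12 + 6)
assert TLB_TAG_WAY_BITS == 92   # 2 ways x 46 tag bits
assert TLB_PTE_WAY_BITS == 128  # 2 ways x 64-bit PTEs
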
def ispow2(x):
    return (1<<log2_int(x, False)) == x

assert (LINE_SIZE % ROW_SIZE) == 0, "LINE_SIZE not multiple of ROW_SIZE"
assert ispow2(LINE_SIZE), "LINE_SIZE not power of 2"
assert ispow2(NUM_LINES), "NUM_LINES not power of 2"
assert ispow2(ROW_PER_LINE), "ROW_PER_LINE not power of 2"
assert ROW_BITS == (INDEX_BITS + ROW_LINE_BITS), "geometry bits don't add up"
assert (LINE_OFF_BITS == ROW_OFF_BITS + ROW_LINE_BITS), \
        "geometry bits don't add up"
assert REAL_ADDR_BITS == (TAG_BITS + INDEX_BITS + LINE_OFF_BITS), \
        "geometry bits don't add up"
assert REAL_ADDR_BITS == (TAG_BITS + ROW_BITS + ROW_OFF_BITS), \
        "geometry bits don't add up"
assert 64 == WB_DATA_BITS, "Can't yet handle wb width that isn't 64-bits"
assert SET_SIZE_BITS <= TLB_LG_PGSZ, "Set indexed by virtual address"


def TLBHit(name):
    return Record([('valid', 1),
                   ('way', TLB_WAY_BITS)], name=name)

def TLBTagEAArray():
    return Array(Signal(TLB_EA_TAG_BITS, name="tlbtagea%d" % x) \
                 for x in range (TLB_NUM_WAYS))

def TLBRecord(name):
    tlb_layout = [('valid', TLB_NUM_WAYS),
                  ('tag', TLB_TAG_WAY_BITS),
                  ('pte', TLB_PTE_WAY_BITS)
                 ]
    return Record(tlb_layout, name=name)

def TLBValidArray():
    return Array(Signal(TLB_NUM_WAYS, name="tlb_valid%d" % x)
                        for x in range(TLB_SET_SIZE))

def HitWaySet():
    return Array(Signal(WAY_BITS, name="hitway_%d" % x) \
                        for x in range(TLB_NUM_WAYS))

# Cache RAM interface
def CacheRamOut():
    return Array(Signal(WB_DATA_BITS, name="cache_out%d" % x) \
                 for x in range(NUM_WAYS))

# PLRU output interface
def PLRUOut():
    return Array(Signal(WAY_BITS, name="plru_out%d" % x) \
                 for x in range(NUM_LINES))

# TLB PLRU output interface
def TLBPLRUOut():
    return Array(Signal(TLB_WAY_BITS, name="tlbplru_out%d" % x) \
                 for x in range(TLB_SET_SIZE))

# Helper functions to decode incoming requests
#
# Return the cache line index (tag index) for an address
def get_index(addr):
    return addr[LINE_OFF_BITS:SET_SIZE_BITS]

# Return the cache row index (data memory) for an address
def get_row(addr):
    return addr[ROW_OFF_BITS:SET_SIZE_BITS]

# Return the index of a row within a line
def get_row_of_line(row):
    return row[:ROW_BITS][:ROW_LINE_BITS]
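
# Worked example (a sketch): with the default geometry, real address
# 0x1234 (binary 0001_0010_0011_0100) decomposes as:
#   get_index        -> bits [6:10]  = 8  (cache line index)
#   get_row          -> bits [3:10]  = 70 (BRAM row: index*8 + row 6)
#   get_row_of_line  -> 70 % 8       = 6  (row within the line)
#   get_tag (below)  -> bits [10:56] = 4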

# Returns whether this is the last row of a line
def is_last_row_addr(addr, last):
    return addr[ROW_OFF_BITS:LINE_OFF_BITS] == last

# Returns whether this is the last row of a line
def is_last_row(row, last):
    return get_row_of_line(row) == last

# Return the next row in the current cache line. We use a
# dedicated function in order to limit the size of the
# generated adder to be only the bits within a cache line
# (3 bits with default settings)
def next_row(row):
    row_v = row[0:ROW_LINE_BITS] + 1
    return Cat(row_v[:ROW_LINE_BITS], row[ROW_LINE_BITS:])
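
# Example (illustrative): with ROW_LINE_BITS=3, next_row(0b0010_111)
# yields 0b0010_000: the low 3 bits increment and wrap within the
# line while the upper (index) bits pass through unchanged, so only
# a 3-bit adder is generated.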

# Get the tag value from the address
def get_tag(addr):
    return addr[SET_SIZE_BITS:REAL_ADDR_BITS]

# Read a tag from a tag memory row
def read_tag(way, tagset):
    return tagset.word_select(way, TAG_WIDTH)[:TAG_BITS]

# Read a TLB tag from a TLB tag memory row
def read_tlb_tag(way, tags):
    return tags.word_select(way, TLB_EA_TAG_BITS)

# Write a TLB tag to a TLB tag memory row
def write_tlb_tag(way, tags, tag):
    return read_tlb_tag(way, tags).eq(tag)

# Read a PTE from a TLB PTE memory row
def read_tlb_pte(way, ptes):
    return ptes.word_select(way, TLB_PTE_BITS)

# Write a PTE to a TLB PTE memory row
def write_tlb_pte(way, ptes, newpte):
    return read_tlb_pte(way, ptes).eq(newpte)
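
# Example (illustrative): word_select(way, width) picks way-sized
# slices out of a packed row, so with TLB_EA_TAG_BITS=46,
# read_tlb_tag(1, tags) selects bits [46:92] of the 92-bit tag row,
# and write_tlb_tag(1, tags, tag) assigns to that same slice.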


# Record for storing permission, attribute, etc. bits from a PTE
class PermAttr(RecordObject):
    def __init__(self, name=None):
        super().__init__(name=name)
        self.reference = Signal()
        self.changed = Signal()
        self.nocache = Signal()
        self.priv = Signal()
        self.rd_perm = Signal()
        self.wr_perm = Signal()


def extract_perm_attr(pte):
    # NOTE: currently a stub: the PermAttr fields are populated
    # directly in DCache.tlb_search rather than extracted here
    pa = PermAttr()
    return pa


# Type of operation on a "valid" input
@unique
class Op(Enum):
    OP_NONE       = 0
    OP_BAD        = 1  # NC cache hit, TLB miss, prot/RC failure
    OP_STCX_FAIL  = 2  # conditional store w/o reservation
    OP_LOAD_HIT   = 3  # Cache hit on load
    OP_LOAD_MISS  = 4  # Load missing cache
    OP_LOAD_NC    = 5  # Non-cachable load
    OP_STORE_HIT  = 6  # Store hitting cache
    OP_STORE_MISS = 7  # Store missing cache


# Cache state machine
@unique
class State(Enum):
    IDLE             = 0  # Normal load hit processing
    RELOAD_WAIT_ACK  = 1  # Cache reload wait ack
    STORE_WAIT_ACK   = 2  # Store wait ack
    NC_LOAD_WAIT_ACK = 3  # Non-cachable load wait ack


# Dcache operations:
#
# In order to make timing, we use the BRAMs with
# an output buffer, which means that the BRAM
# output is delayed by an extra cycle.
#
# Thus, the dcache has a 2-stage internal pipeline
# for cache hits with no stalls.
#
# All other operations are handled via stalling
# in the first stage.
#
# The second stage can thus complete a hit at the same
# time as the first stage emits a stall for a complex op.
#
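# Approximate load-hit timing (a sketch based on the description
# above; cycle numbering is illustrative only):
#
#   cycle 0: request on d_in; tag/data RAM reads issued; latch into r0
#   cycle 1: stage 1: TLB lookup and cache tag comparison
#   cycle 2: buffered BRAM output muxed onto d_out, d_out.valid raised
#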
# Stage 0 register, basically contains just the latched request

class RegStage0(RecordObject):
    def __init__(self, name=None):
        super().__init__(name=name)
        self.req = LoadStore1ToDCacheType(name="lsmem")
        self.tlbie = Signal()   # indicates a tlbie request (from MMU)
        self.doall = Signal()   # with tlbie, indicates flush whole TLB
        self.tlbld = Signal()   # indicates a TLB load request (from MMU)
        self.mmu_req = Signal() # indicates source of request
        self.d_valid = Signal() # indicates req.data is valid now


class MemAccessRequest(RecordObject):
    def __init__(self, name=None):
        super().__init__(name=name)
        self.op = Signal(Op)
        self.valid = Signal()
        self.dcbz = Signal()
        self.real_addr = Signal(REAL_ADDR_BITS)
        self.data = Signal(64)
        self.byte_sel = Signal(8)
        self.hit_way = Signal(WAY_BITS)
        self.same_tag = Signal()
        self.mmu_req = Signal()


# First stage register, contains state for stage 1 of load hits
# and for the state machine used by all other operations
class RegStage1(RecordObject):
    def __init__(self, name=None):
        super().__init__(name=name)
        # Info about the request
        self.full = Signal()    # have uncompleted request
        self.mmu_req = Signal() # request is from MMU
        self.req = MemAccessRequest(name="reqmem")

        # Cache hit state
        self.hit_way = Signal(WAY_BITS)
        self.hit_load_valid = Signal()
        self.hit_index = Signal(INDEX_BITS)
        self.cache_hit = Signal()

        # TLB hit state
        self.tlb_hit = TLBHit("tlb_hit")
        self.tlb_hit_index = Signal(TLB_SET_BITS)

        # 2-stage data buffer for data forwarded from writes to reads
        self.forward_data1 = Signal(64)
        self.forward_data2 = Signal(64)
        self.forward_sel1 = Signal(8)
        self.forward_valid1 = Signal()
        self.forward_way1 = Signal(WAY_BITS)
        self.forward_row1 = Signal(ROW_BITS)
        self.use_forward1 = Signal()
        self.forward_sel = Signal(8)

        # Cache miss state (reload state machine)
        self.state = Signal(State)
        self.dcbz = Signal()
        self.write_bram = Signal()
        self.write_tag = Signal()
        self.slow_valid = Signal()
        self.wb = WBMasterOut("wb")
        self.reload_tag = Signal(TAG_BITS)
        self.store_way = Signal(WAY_BITS)
        self.store_row = Signal(ROW_BITS)
        self.store_index = Signal(INDEX_BITS)
        self.end_row_ix = Signal(ROW_LINE_BITS)
        self.rows_valid = RowPerLineValidArray()
        self.acks_pending = Signal(3)
        self.inc_acks = Signal()
        self.dec_acks = Signal()

        # Signals to complete (possibly with error)
        self.ls_valid = Signal()
        self.ls_error = Signal()
        self.mmu_done = Signal()
        self.mmu_error = Signal()
        self.cache_paradox = Signal()

        # Signal to complete a failed stcx.
        self.stcx_fail = Signal()


# Reservation information
class Reservation(RecordObject):
    def __init__(self):
        super().__init__()
        self.valid = Signal()
        self.addr = Signal(64-LINE_OFF_BITS)
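        # note (illustrative): the low LINE_OFF_BITS (6) bits are
        # dropped, so a reservation covers a whole cache line;
        # reservation_comb compares addr[LINE_OFF_BITS:64] against this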


class DTLBUpdate(Elaboratable):
    def __init__(self):
        self.tlbie = Signal()
        self.tlbwe = Signal()
        self.doall = Signal()
        self.tlb_hit = TLBHit("tlb_hit")
        self.tlb_req_index = Signal(TLB_SET_BITS)

        self.tlb_tag_way = Signal(TLB_TAG_WAY_BITS)
        self.tlb_pte_way = Signal(TLB_PTE_WAY_BITS)
        self.repl_way = Signal(TLB_WAY_BITS)
        self.eatag = Signal(TLB_EA_TAG_BITS)
        self.pte_data = Signal(TLB_PTE_BITS)

        # read from dtlb array
        self.tlb_read = Signal()
        self.tlb_read_index = Signal(TLB_SET_BITS)
        self.tlb_way = TLBRecord("o_tlb_way")

    def elaborate(self, platform):
        m = Module()
        comb = m.d.comb
        sync = m.d.sync

        # there are 3 parts to this:
        # QTY TLB_NUM_WAYs TAGs - of width (say) 46 bits of Effective Address
        # QTY TLB_NUM_WAYs PTEs - of width (say) 64 bits
        # "Valid" bits, one per "way", of QTY TLB_NUM_WAYs.  these cannot
        # be a Memory because they may all be cleared at once (tlbie,
        # doall).  we _could_, in theory, do it by overriding the Reset
        # Signal of the Memory, hmmm....

        dtlb_valid = TLBValidArray()
        tlb_req_index = self.tlb_req_index

        print ("TLB_TAG_WAY_BITS", TLB_TAG_WAY_BITS)
        print (" TLB_EA_TAG_BITS", TLB_EA_TAG_BITS)
        print ("    TLB_NUM_WAYS", TLB_NUM_WAYS)
        print ("TLB_PTE_WAY_BITS", TLB_PTE_WAY_BITS)
        print ("    TLB_PTE_BITS", TLB_PTE_BITS)
        print ("    TLB_NUM_WAYS", TLB_NUM_WAYS)

        # TAG and PTE Memory SRAMs. transparent, write-enables are TLB_NUM_WAYS
        tagway = Memory(depth=TLB_SET_SIZE, width=TLB_TAG_WAY_BITS)
        m.submodules.rd_tagway = rd_tagway = tagway.read_port()
        m.submodules.wr_tagway = wr_tagway = tagway.write_port(
                                    granularity=TLB_EA_TAG_BITS)

        pteway = Memory(depth=TLB_SET_SIZE, width=TLB_PTE_WAY_BITS)
        m.submodules.rd_pteway = rd_pteway = pteway.read_port()
        m.submodules.wr_pteway = wr_pteway = pteway.write_port(
                                    granularity=TLB_PTE_BITS)

        # commented out for now, can be put in if Memory.reset can be
        # used for tlbie&doall to reset the entire Memory to zero in 1 cycle
        #validm = Memory(depth=TLB_SET_SIZE, width=TLB_NUM_WAYS)
        #m.submodules.rd_valid = rd_valid = validm.read_port()
        #m.submodules.wr_valid = wr_valid = validm.write_port(
        #granularity=1)

        # connect up read and write addresses to Valid/PTE/TAG SRAMs
        m.d.comb += rd_pteway.addr.eq(self.tlb_read_index)
        m.d.comb += rd_tagway.addr.eq(self.tlb_read_index)
        #m.d.comb += rd_valid.addr.eq(self.tlb_read_index)
        m.d.comb += wr_tagway.addr.eq(tlb_req_index)
        m.d.comb += wr_pteway.addr.eq(tlb_req_index)
        #m.d.comb += wr_valid.addr.eq(tlb_req_index)

        updated = Signal()
        v_updated = Signal()
        tb_out = Signal(TLB_TAG_WAY_BITS) # tlb_way_tags_t
        db_out = Signal(TLB_NUM_WAYS)     # tlb_way_valids_t
        pb_out = Signal(TLB_PTE_WAY_BITS) # tlb_way_ptes_t
        dv = Signal(TLB_NUM_WAYS)         # tlb_way_valids_t

        comb += dv.eq(dtlb_valid[tlb_req_index])
        comb += db_out.eq(dv)

        with m.If(self.tlbie & self.doall):
            # clear all valid bits at once
            # XXX hmmm, validm _could_ use Memory reset here...
            for i in range(TLB_SET_SIZE):
                sync += dtlb_valid[i].eq(0)
        with m.Elif(self.tlbie):
            # invalidate just the hit_way
            with m.If(self.tlb_hit.valid):
                comb += db_out.bit_select(self.tlb_hit.way, 1).eq(0)
                comb += v_updated.eq(1)
        with m.Elif(self.tlbwe):
            # write to the requested tag and PTE
            comb += write_tlb_tag(self.repl_way, tb_out, self.eatag)
            comb += write_tlb_pte(self.repl_way, pb_out, self.pte_data)
            # set valid bit
            comb += db_out.bit_select(self.repl_way, 1).eq(1)

            comb += updated.eq(1)
            comb += v_updated.eq(1)

        # above, sometimes valid is requested to be updated but data not
        # therefore split them out, here.  note the granularity thing matches
        # with the shift-up of the eatag/pte_data into the correct TLB way.
        # thus it is not necessary to write the entire lot, just the portion
        # being altered: hence writing the *old* copy of the row is not needed
        with m.If(updated): # PTE and TAG to be written
            comb += wr_pteway.data.eq(pb_out)
            comb += wr_pteway.en.eq(1<<self.repl_way)
            comb += wr_tagway.data.eq(tb_out)
            comb += wr_tagway.en.eq(1<<self.repl_way)
        with m.If(v_updated): # Valid to be written
            sync += dtlb_valid[tlb_req_index].eq(db_out)
            #comb += wr_valid.data.eq(db_out)
            #comb += wr_valid.en.eq(1<<self.repl_way)

        # select one TLB way, use a register here
        r_tlb_way = TLBRecord("r_tlb_way")
        r_delay = Signal()
        sync += r_delay.eq(self.tlb_read)
        with m.If(self.tlb_read):
            sync += self.tlb_way.valid.eq(dtlb_valid[self.tlb_read_index])
        with m.If(r_delay):
            # on one clock delay, output the contents of the read port(s)
            # comb += self.tlb_way.valid.eq(rd_valid.data)
            comb += self.tlb_way.tag.eq(rd_tagway.data)
            comb += self.tlb_way.pte.eq(rd_pteway.data)
            # and also capture the (delayed) output...
            #sync += r_tlb_way.valid.eq(rd_valid.data)
            sync += r_tlb_way.tag.eq(rd_tagway.data)
            sync += r_tlb_way.pte.eq(rd_pteway.data)
        with m.Else():
            # ... so that the register can output it when no read is requested
            # it's rather overkill but better to be safe than sorry
            comb += self.tlb_way.tag.eq(r_tlb_way.tag)
            comb += self.tlb_way.pte.eq(r_tlb_way.pte)
            #comb += self.tlb_way.eq(r_tlb_way)

        return m


class DCachePendingHit(Elaboratable):

    def __init__(self, tlb_way,
                 cache_i_validdx, cache_tag_set,
                 req_addr,
                 hit_set):

        self.go = Signal()
        self.virt_mode = Signal()
        self.is_hit = Signal()
        self.tlb_hit = TLBHit("tlb_hit")
        self.hit_way = Signal(WAY_BITS)
        self.rel_match = Signal()
        self.req_index = Signal(INDEX_BITS)
        self.reload_tag = Signal(TAG_BITS)

        self.tlb_way = tlb_way
        self.cache_i_validdx = cache_i_validdx
        self.cache_tag_set = cache_tag_set
        self.req_addr = req_addr
        self.hit_set = hit_set

    def elaborate(self, platform):
        m = Module()
        comb = m.d.comb
        sync = m.d.sync

        go = self.go
        virt_mode = self.virt_mode
        is_hit = self.is_hit
        tlb_way = self.tlb_way
        cache_i_validdx = self.cache_i_validdx
        cache_tag_set = self.cache_tag_set
        req_addr = self.req_addr
        tlb_hit = self.tlb_hit
        hit_set = self.hit_set
        hit_way = self.hit_way
        rel_match = self.rel_match
        req_index = self.req_index
        reload_tag = self.reload_tag

        rel_matches = Array(Signal(name="rel_matches_%d" % i) \
                            for i in range(TLB_NUM_WAYS))
        hit_way_set = HitWaySet()

        # Test if pending request is a hit on any way
        # In order to make timing in virtual mode,
        # when we are using the TLB, we compare each
        # way with each of the real addresses from each way of
        # the TLB, and then decide later which match to use.

        with m.If(virt_mode):
            for j in range(TLB_NUM_WAYS): # tlb_num_way_t
                s_tag = Signal(TAG_BITS, name="s_tag%d" % j)
                s_hit = Signal()
                s_pte = Signal(TLB_PTE_BITS)
                s_ra = Signal(REAL_ADDR_BITS)
                comb += s_pte.eq(read_tlb_pte(j, tlb_way.pte))
                comb += s_ra.eq(Cat(req_addr[0:TLB_LG_PGSZ],
                                    s_pte[TLB_LG_PGSZ:REAL_ADDR_BITS]))
                comb += s_tag.eq(get_tag(s_ra))

                for i in range(NUM_WAYS): # way_t
                    is_tag_hit = Signal(name="is_tag_hit_%d_%d" % (j, i))
                    comb += is_tag_hit.eq(go & cache_i_validdx[i] &
                                  (read_tag(i, cache_tag_set) == s_tag)
                                  & (tlb_way.valid[j]))
                    with m.If(is_tag_hit):
                        comb += hit_way_set[j].eq(i)
                        comb += s_hit.eq(1)
                comb += hit_set[j].eq(s_hit)
                with m.If(s_tag == reload_tag):
                    comb += rel_matches[j].eq(1)
            with m.If(tlb_hit.valid):
                comb += is_hit.eq(hit_set[tlb_hit.way])
                comb += hit_way.eq(hit_way_set[tlb_hit.way])
                comb += rel_match.eq(rel_matches[tlb_hit.way])
        with m.Else():
            s_tag = Signal(TAG_BITS)
            comb += s_tag.eq(get_tag(req_addr))
            for i in range(NUM_WAYS): # way_t
                is_tag_hit = Signal(name="is_tag_hit_%d" % i)
                comb += is_tag_hit.eq(go & cache_i_validdx[i] &
                          (read_tag(i, cache_tag_set) == s_tag))
                with m.If(is_tag_hit):
                    comb += hit_way.eq(i)
                    comb += is_hit.eq(1)
            with m.If(s_tag == reload_tag):
                comb += rel_match.eq(1)

        return m


class DCache(Elaboratable):
    """Set associative dcache write-through

    TODO (in no specific order):
    * See list in icache.vhdl
    * Complete load misses on the cycle when WB data comes instead of
      at the end of line (this requires dealing with requests coming in
      while not idle...)
    """
    def __init__(self):
        self.d_in = LoadStore1ToDCacheType("d_in")
        self.d_out = DCacheToLoadStore1Type("d_out")

        self.m_in = MMUToDCacheType("m_in")
        self.m_out = DCacheToMMUType("m_out")

        self.stall_out = Signal()

        # standard naming (wired to non-standard for compatibility)
        self.bus = Interface(addr_width=32,
                             data_width=64,
                             granularity=8,
                             features={'stall'},
                             alignment=0,
                             name="dcache")

        self.log_out = Signal(20)

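    # Example use with the "for test" SRAM imported above (a sketch
    # only: the SRAM constructor arguments and wiring are assumptions
    # for illustration, not part of this module):
    #
    #   m = Module()
    #   m.submodules.dut = dut = DCache()
    #   mem = Memory(width=64, depth=512)
    #   m.submodules.sram = sram = SRAM(memory=mem, granularity=8)
    #   m.d.comb += sram.bus.cyc.eq(dut.bus.cyc)
    #   m.d.comb += sram.bus.stb.eq(dut.bus.stb)
    #   m.d.comb += sram.bus.we.eq(dut.bus.we)
    #   m.d.comb += sram.bus.sel.eq(dut.bus.sel)
    #   m.d.comb += sram.bus.adr.eq(dut.bus.adr)
    #   m.d.comb += sram.bus.dat_w.eq(dut.bus.dat_w)
    #   m.d.comb += dut.bus.ack.eq(sram.bus.ack)
    #   m.d.comb += dut.bus.dat_r.eq(sram.bus.dat_r)
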
    def stage_0(self, m, r0, r1, r0_full):
        """Latch the request in r0.req as long as we're not stalling
        """
        comb = m.d.comb
        sync = m.d.sync
        d_in, d_out, m_in = self.d_in, self.d_out, self.m_in

        r = RegStage0("stage0")

        # TODO, this goes in unit tests and formal proofs
        with m.If(d_in.valid & m_in.valid):
            sync += Display("request collision loadstore vs MMU")

        with m.If(m_in.valid):
            comb += r.req.valid.eq(1)
            comb += r.req.load.eq(~(m_in.tlbie | m_in.tlbld)) # no invalidate
            comb += r.req.dcbz.eq(0)
            comb += r.req.nc.eq(0)
            comb += r.req.reserve.eq(0)
            comb += r.req.virt_mode.eq(0)
            comb += r.req.priv_mode.eq(1)
            comb += r.req.addr.eq(m_in.addr)
            comb += r.req.data.eq(m_in.pte)
            comb += r.req.byte_sel.eq(~0) # Const -1 sets all to 0b111....
            comb += r.tlbie.eq(m_in.tlbie)
            comb += r.doall.eq(m_in.doall)
            comb += r.tlbld.eq(m_in.tlbld)
            comb += r.mmu_req.eq(1)
            m.d.sync += Display("    DCACHE req mmu addr %x pte %x ld %d",
                                m_in.addr, m_in.pte, r.req.load)

        with m.Else():
            comb += r.req.eq(d_in)
            comb += r.req.data.eq(0)
            comb += r.tlbie.eq(0)
            comb += r.doall.eq(0)
            comb += r.tlbld.eq(0)
            comb += r.mmu_req.eq(0)
        with m.If((~r1.full & ~d_in.hold) | ~r0_full):
            sync += r0.eq(r)
            sync += r0_full.eq(r.req.valid)
        # Sample data the cycle after a request comes in from loadstore1.
        # If another request has come in already then the data will get
        # put directly into req.data below.
        with m.If(r0.req.valid & ~r.req.valid & ~r0.d_valid &
                  ~r0.mmu_req):
            sync += r0.req.data.eq(d_in.data)
            sync += r0.d_valid.eq(1)
        with m.If(d_in.valid):
            m.d.sync += Display("    DCACHE req cache "
                                "virt %d addr %x data %x ld %d",
                                r.req.virt_mode, r.req.addr,
                                r.req.data, r.req.load)

    def tlb_read(self, m, r0_stall, tlb_way):
        """TLB
        Operates in the second cycle on the request latched in r0.req.
        TLB updates write the entry at the end of the second cycle.
        """
        comb = m.d.comb
        sync = m.d.sync
        m_in, d_in = self.m_in, self.d_in

        addrbits = Signal(TLB_SET_BITS)

        amin = TLB_LG_PGSZ
        amax = TLB_LG_PGSZ + TLB_SET_BITS

        with m.If(m_in.valid):
            comb += addrbits.eq(m_in.addr[amin : amax])
        with m.Else():
            comb += addrbits.eq(d_in.addr[amin : amax])

        # If we have any op and the previous op isn't finished,
        # then keep the same output for next cycle.
        d = self.dtlb_update
        comb += d.tlb_read_index.eq(addrbits)
        comb += d.tlb_read.eq(~r0_stall)
        comb += tlb_way.eq(d.tlb_way)

    def maybe_tlb_plrus(self, m, r1, tlb_plru_victim, tlb_req_index):
        """Generate TLB PLRUs
        """
        comb = m.d.comb
        sync = m.d.sync

        if TLB_NUM_WAYS == 0:
            return

        # Binary-to-Unary one-hot, enabled by tlb_hit valid
        tlb_plrus = PLRUs(TLB_SET_SIZE, TLB_WAY_BITS)
        m.submodules.tlb_plrus = tlb_plrus
        comb += tlb_plrus.way.eq(r1.tlb_hit.way)
        comb += tlb_plrus.valid.eq(r1.tlb_hit.valid)
        comb += tlb_plrus.index.eq(r1.tlb_hit_index)
        comb += tlb_plrus.isel.eq(tlb_req_index) # select victim
        comb += tlb_plru_victim.eq(tlb_plrus.o_index) # selected victim

    def tlb_search(self, m, tlb_req_index, r0, r0_valid,
                   tlb_way,
                   pte, tlb_hit, valid_ra, perm_attr, ra):

        comb = m.d.comb

        hitway = Signal(TLB_WAY_BITS)
        hit = Signal()
        eatag = Signal(TLB_EA_TAG_BITS)

        TLB_LG_END = TLB_LG_PGSZ + TLB_SET_BITS
        comb += tlb_req_index.eq(r0.req.addr[TLB_LG_PGSZ : TLB_LG_END])
        comb += eatag.eq(r0.req.addr[TLB_LG_END : 64 ])

        for i in range(TLB_NUM_WAYS):
            is_tag_hit = Signal(name="is_tag_hit%d" % i)
            tlb_tag = Signal(TLB_EA_TAG_BITS, name="tlb_tag%d" % i)
            comb += tlb_tag.eq(read_tlb_tag(i, tlb_way.tag))
            comb += is_tag_hit.eq((tlb_way.valid[i]) & (tlb_tag == eatag))
            with m.If(is_tag_hit):
                comb += hitway.eq(i)
                comb += hit.eq(1)

        comb += tlb_hit.valid.eq(hit & r0_valid)
        comb += tlb_hit.way.eq(hitway)

        with m.If(tlb_hit.valid):
            comb += pte.eq(read_tlb_pte(hitway, tlb_way.pte))
        comb += valid_ra.eq(tlb_hit.valid | ~r0.req.virt_mode)

        with m.If(r0.req.virt_mode):
            comb += ra.eq(Cat(Const(0, ROW_OFF_BITS),
                              r0.req.addr[ROW_OFF_BITS:TLB_LG_PGSZ],
                              pte[TLB_LG_PGSZ:REAL_ADDR_BITS]))
            comb += perm_attr.reference.eq(pte[8])
            comb += perm_attr.changed.eq(pte[7])
            comb += perm_attr.nocache.eq(pte[5])
            comb += perm_attr.priv.eq(pte[3])
            comb += perm_attr.rd_perm.eq(pte[2])
            comb += perm_attr.wr_perm.eq(pte[1])
        with m.Else():
            comb += ra.eq(Cat(Const(0, ROW_OFF_BITS),
                              r0.req.addr[ROW_OFF_BITS:REAL_ADDR_BITS]))
            comb += perm_attr.reference.eq(1)
            comb += perm_attr.changed.eq(1)
            comb += perm_attr.nocache.eq(0)
            comb += perm_attr.priv.eq(1)
            comb += perm_attr.rd_perm.eq(1)
            comb += perm_attr.wr_perm.eq(1)

        with m.If(valid_ra):
            m.d.sync += Display("DCACHE virt mode %d hit %d ra %x pte %x",
                                r0.req.virt_mode, tlb_hit.valid, ra, pte)
            m.d.sync += Display("       perm ref=%d", perm_attr.reference)
            m.d.sync += Display("       perm chg=%d", perm_attr.changed)
            m.d.sync += Display("       perm noc=%d", perm_attr.nocache)
            m.d.sync += Display("       perm prv=%d", perm_attr.priv)
            m.d.sync += Display("       perm rdp=%d", perm_attr.rd_perm)
            m.d.sync += Display("       perm wrp=%d", perm_attr.wr_perm)

    def tlb_update(self, m, r0_valid, r0, tlb_req_index,
                   tlb_hit, tlb_plru_victim, tlb_way):

        comb = m.d.comb
        sync = m.d.sync

        tlbie = Signal()
        tlbwe = Signal()

        comb += tlbie.eq(r0_valid & r0.tlbie)
        comb += tlbwe.eq(r0_valid & r0.tlbld)

        d = self.dtlb_update

        comb += d.tlbie.eq(tlbie)
        comb += d.tlbwe.eq(tlbwe)
        comb += d.doall.eq(r0.doall)
        comb += d.tlb_hit.eq(tlb_hit)
        comb += d.tlb_tag_way.eq(tlb_way.tag)
        comb += d.tlb_pte_way.eq(tlb_way.pte)
        comb += d.tlb_req_index.eq(tlb_req_index)

        with m.If(tlb_hit.valid):
            comb += d.repl_way.eq(tlb_hit.way)
        with m.Else():
            comb += d.repl_way.eq(tlb_plru_victim)
        comb += d.eatag.eq(r0.req.addr[TLB_LG_PGSZ + TLB_SET_BITS:64])
        comb += d.pte_data.eq(r0.req.data)

    def maybe_plrus(self, m, r1, plru_victim):
        """Generate PLRUs
        """
        comb = m.d.comb
        sync = m.d.sync

        if NUM_WAYS == 0:
            return

        m.submodules.plrus = plrus = PLRUs(NUM_LINES, WAY_BITS)
        comb += plrus.way.eq(r1.hit_way)
        comb += plrus.valid.eq(r1.cache_hit)
        comb += plrus.index.eq(r1.hit_index)
        comb += plrus.isel.eq(r1.store_index) # select victim
        comb += plru_victim.eq(plrus.o_index) # selected victim

    def cache_tag_read(self, m, r0_stall, req_index, cache_tag_set, cache_tags):
        """Cache tag RAM read port
        """
        comb = m.d.comb
        sync = m.d.sync
        m_in, d_in = self.m_in, self.d_in

        index = Signal(INDEX_BITS)

        with m.If(r0_stall):
            comb += index.eq(req_index)
        with m.Elif(m_in.valid):
            comb += index.eq(get_index(m_in.addr))
        with m.Else():
            comb += index.eq(get_index(d_in.addr))
        sync += cache_tag_set.eq(cache_tags[index].tag)

    def dcache_request(self, m, r0, ra, req_index, req_row, req_tag,
                       r0_valid, r1, cache_tags, replace_way,
                       use_forward1_next, use_forward2_next,
                       req_hit_way, plru_victim, rc_ok, perm_attr,
                       valid_ra, perm_ok, access_ok, req_op, req_go,
                       tlb_hit, tlb_way, cache_tag_set,
                       cancel_store, req_same_tag, r0_stall, early_req_row):
        """Cache request parsing and hit detection
        """

        comb = m.d.comb
        m_in, d_in = self.m_in, self.d_in

        is_hit = Signal()
        hit_way = Signal(WAY_BITS)
        op = Signal(Op)
        opsel = Signal(3)
        go = Signal()
        nc = Signal()
        hit_set = Array(Signal(name="hit_set_%d" % i) \
                        for i in range(TLB_NUM_WAYS))
        cache_i_validdx = Signal(NUM_WAYS)

        # Extract line, row and tag from request
        comb += req_index.eq(get_index(r0.req.addr))
        comb += req_row.eq(get_row(r0.req.addr))
        comb += req_tag.eq(get_tag(ra))

        if False: # display on comb is a bit... busy.
            comb += Display("dcache_req addr:%x ra: %x idx: %x tag: %x row: %x",
                            r0.req.addr, ra, req_index, req_tag, req_row)

        comb += go.eq(r0_valid & ~(r0.tlbie | r0.tlbld) & ~r1.ls_error)
        comb += cache_i_validdx.eq(cache_tags[req_index].valid)

        m.submodules.dcache_pend = dc = DCachePendingHit(tlb_way,
                                            cache_i_validdx, cache_tag_set,
                                            r0.req.addr,
                                            hit_set)
        comb += dc.tlb_hit.eq(tlb_hit)
        comb += dc.reload_tag.eq(r1.reload_tag)
        comb += dc.virt_mode.eq(r0.req.virt_mode)
        comb += dc.go.eq(go)
        comb += dc.req_index.eq(req_index)

        comb += is_hit.eq(dc.is_hit)
        comb += hit_way.eq(dc.hit_way)
        comb += req_same_tag.eq(dc.rel_match)

        # See if the request matches the line currently being reloaded
        with m.If((r1.state == State.RELOAD_WAIT_ACK) &
                  (req_index == r1.store_index) & req_same_tag):
            # For a store, consider this a hit even if the row isn't
            # valid since it will be by the time we perform the store.
            # For a load, check the appropriate row valid bit.
            rrow = Signal(ROW_LINE_BITS)
            comb += rrow.eq(req_row)
            valid = r1.rows_valid[rrow]
            comb += is_hit.eq((~r0.req.load) | valid)
            comb += hit_way.eq(replace_way)

        # Whether to use forwarded data for a load or not
        with m.If((get_row(r1.req.real_addr) == req_row) &
                  (r1.req.hit_way == hit_way)):
            # Only need to consider r1.write_bram here, since if we
            # are writing refill data here, then we don't have a
            # cache hit this cycle on the line being refilled.
            # (There is the possibility that the load following the
            # load miss that started the refill could be to the old
            # contents of the victim line, since it is a couple of
            # cycles after the refill starts before we see the updated
            # cache tag. In that case we don't use the bypass.)
            comb += use_forward1_next.eq(r1.write_bram)
        with m.If((r1.forward_row1 == req_row) & (r1.forward_way1 == hit_way)):
            comb += use_forward2_next.eq(r1.forward_valid1)

        # The way that matched on a hit
        comb += req_hit_way.eq(hit_way)

        # The way to replace on a miss
        with m.If(r1.write_tag):
            comb += replace_way.eq(plru_victim)
        with m.Else():
            comb += replace_way.eq(r1.store_way)

        # work out whether we have permission for this access
        # NB we don't yet implement AMR, thus no KUAP
        comb += rc_ok.eq(perm_attr.reference
                         & (r0.req.load | perm_attr.changed))
        comb += perm_ok.eq((r0.req.priv_mode | (~perm_attr.priv)) &
                           (perm_attr.wr_perm |
                            (r0.req.load & perm_attr.rd_perm)))
        comb += access_ok.eq(valid_ra & perm_ok & rc_ok)

        # Combine the request and cache hit status to decide what
        # operation needs to be done
        comb += nc.eq(r0.req.nc | perm_attr.nocache)
        comb += op.eq(Op.OP_NONE)
        with m.If(go):
            with m.If(~access_ok):
                m.d.sync += Display("DCACHE access fail valid_ra=%d p=%d rc=%d",
                                    valid_ra, perm_ok, rc_ok)
                comb += op.eq(Op.OP_BAD)
            with m.Elif(cancel_store):
                m.d.sync += Display("DCACHE cancel store")
                comb += op.eq(Op.OP_STCX_FAIL)
            with m.Else():
                m.d.sync += Display("DCACHE valid_ra=%d nc=%d ld=%d",
                                    valid_ra, nc, r0.req.load)
                comb += opsel.eq(Cat(is_hit, nc, r0.req.load))
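                # opsel is Cat(is_hit, nc, load): bit 0 = is_hit,
                # bit 1 = non-cacheable, bit 2 = load.  e.g. 0b101 is
                # a cacheable load that hit.  the two encodings with
                # both nc and is_hit set (0b011, 0b111) are cache
                # paradoxes and map to OP_BAD.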
                with m.Switch(opsel):
                    with m.Case(0b101): comb += op.eq(Op.OP_LOAD_HIT)
                    with m.Case(0b100): comb += op.eq(Op.OP_LOAD_MISS)
                    with m.Case(0b110): comb += op.eq(Op.OP_LOAD_NC)
                    with m.Case(0b001): comb += op.eq(Op.OP_STORE_HIT)
                    with m.Case(0b000): comb += op.eq(Op.OP_STORE_MISS)
                    with m.Case(0b010): comb += op.eq(Op.OP_STORE_MISS)
                    with m.Case(0b011): comb += op.eq(Op.OP_BAD)
                    with m.Case(0b111): comb += op.eq(Op.OP_BAD)
        comb += req_op.eq(op)
        comb += req_go.eq(go)

        # Version of the row number that is valid one cycle earlier
        # in the cases where we need to read the cache data BRAM.
        # If we're stalling then we need to keep reading the last
        # row requested.
        with m.If(~r0_stall):
            with m.If(m_in.valid):
                comb += early_req_row.eq(get_row(m_in.addr))
            with m.Else():
                comb += early_req_row.eq(get_row(d_in.addr))
        with m.Else():
            comb += early_req_row.eq(req_row)

    def reservation_comb(self, m, cancel_store, set_rsrv, clear_rsrv,
                         r0_valid, r0, reservation):
        """Handle load-with-reservation and store-conditional instructions
        """
        comb = m.d.comb

        with m.If(r0_valid & r0.req.reserve):
            # XXX generate alignment interrupt if address
            # is not aligned XXX or if r0.req.nc = '1'
            with m.If(r0.req.load):
                comb += set_rsrv.eq(r0.req.atomic_last) # load with reservation
            with m.Else():
                comb += clear_rsrv.eq(r0.req.atomic_last) # store conditional
                with m.If((~reservation.valid) |
                          (r0.req.addr[LINE_OFF_BITS:64] != reservation.addr)):
                    comb += cancel_store.eq(1)

    def reservation_reg(self, m, r0_valid, access_ok, set_rsrv, clear_rsrv,
                        reservation, r0):
        comb = m.d.comb
        sync = m.d.sync

        with m.If(r0_valid & access_ok):
            with m.If(clear_rsrv):
                sync += reservation.valid.eq(0)
            with m.Elif(set_rsrv):
                sync += reservation.valid.eq(1)
                sync += reservation.addr.eq(r0.req.addr[LINE_OFF_BITS:64])

    def writeback_control(self, m, r1, cache_out_row):
        """Return data for loads & completion control logic
        """
        comb = m.d.comb
        sync = m.d.sync
        d_out, m_out = self.d_out, self.m_out

        data_out = Signal(64)
        data_fwd = Signal(64)

        # Use the bypass if we are reading the row that was
        # written 1 or 2 cycles ago, including for the
        # slow_valid = 1 case (i.e. completing a load
        # miss or a non-cacheable load).
        with m.If(r1.use_forward1):
            comb += data_fwd.eq(r1.forward_data1)
        with m.Else():
            comb += data_fwd.eq(r1.forward_data2)

        comb += data_out.eq(cache_out_row)

        for i in range(8):
            with m.If(r1.forward_sel[i]):
                dsel = data_fwd.word_select(i, 8)
                comb += data_out.word_select(i, 8).eq(dsel)

        # DCache output to LoadStore
        comb += d_out.valid.eq(r1.ls_valid)
        comb += d_out.data.eq(data_out)
        comb += d_out.store_done.eq(~r1.stcx_fail)
        comb += d_out.error.eq(r1.ls_error)
        comb += d_out.cache_paradox.eq(r1.cache_paradox)

        # Outputs to MMU
        comb += m_out.done.eq(r1.mmu_done)
        comb += m_out.err.eq(r1.mmu_error)
        comb += m_out.data.eq(data_out)

        # We have a valid load or store hit or we just completed
        # a slow op such as a load miss, a NC load or a store
        #
        # Note: the load hit is delayed by one cycle. However it
        # can still not collide with r.slow_valid (well unless I
        # miscalculated) because slow_valid can only be set on a
        # subsequent request and not on its first cycle (the state
        # machine must have advanced), which makes slow_valid
        # at least 2 cycles from the previous hit_load_valid.

        # Sanity: Only one of these must be set in any given cycle

        if False: # TODO: need Display to get this to work
            assert (r1.slow_valid & r1.stcx_fail) != 1, \
                "unexpected slow_valid collision with stcx_fail"

            assert ((r1.slow_valid | r1.stcx_fail) | r1.hit_load_valid) != 1, \
                "unexpected hit_load_delayed collision with slow_valid"

        with m.If(~r1.mmu_req):
            # Request came from loadstore1...
            # Load hit case is the standard path
            with m.If(r1.hit_load_valid):
                sync += Display("completing load hit data=%x", data_out)

            # error cases complete without stalling
            with m.If(r1.ls_error):
                with m.If(r1.dcbz):
                    sync += Display("completing dcbz with error")
                with m.Else():
                    sync += Display("completing ld/st with error")

            # Slow ops (load miss, NC, stores)
            with m.If(r1.slow_valid):
                sync += Display("completing store or load miss adr=%x data=%x",
                                r1.req.real_addr, data_out)

        with m.Else():
            # Request came from MMU
            with m.If(r1.hit_load_valid):
                sync += Display("completing load hit to MMU, data=%x",
                                m_out.data)
            # error cases complete without stalling
            with m.If(r1.mmu_error):
                sync += Display("completing MMU ld with error")

            # Slow ops (i.e. load miss)
            with m.If(r1.slow_valid):
                sync += Display("completing MMU load miss, adr=%x data=%x",
                                r1.req.real_addr, m_out.data)

    def rams(self, m, r1, early_req_row, cache_out_row, replace_way):
        """rams
        Generate a cache RAM for each way. This handles the normal
        reads, writes from reloads and the special store-hit update
        path as well.

        Note: the BRAMs have an extra read buffer, meaning the output
        is pipelined an extra cycle. This differs from the
        icache. The writeback logic needs to take that into
        account by using 1-cycle delayed signals for load hits.
        """
        comb = m.d.comb
        bus = self.bus

        # Binary-to-Unary one-hot decoders here.  the replace-way one-hot
        # is gated (enabled) by bus.ack, not-write-bram, and state
        # RELOAD_WAIT_ACK
        m.submodules.rams_replace_way_e = rwe = Decoder(NUM_WAYS)
        comb += rwe.n.eq(~((r1.state == State.RELOAD_WAIT_ACK) & bus.ack &
                           ~r1.write_bram))
        comb += rwe.i.eq(replace_way)

        m.submodules.rams_hit_way_e = hwe = Decoder(NUM_WAYS)
        comb += hwe.i.eq(r1.hit_way)

        # this one is gated with write_bram, and replace_way_e can never be
        # set at the same time.  that means that do_write can OR the outputs
        m.submodules.rams_hit_req_way_e = hre = Decoder(NUM_WAYS)
        comb += hre.n.eq(~r1.write_bram) # Decoder.n is inverted
        comb += hre.i.eq(r1.req.hit_way)

        # common Signals
        do_read = Signal()
        wr_addr = Signal(ROW_BITS)
        wr_data = Signal(WB_DATA_BITS)
        wr_sel = Signal(ROW_SIZE)
        rd_addr = Signal(ROW_BITS)

        comb += do_read.eq(1) # always enable
        comb += rd_addr.eq(early_req_row)

        # Write mux:
        #
        # Defaults to wishbone read responses (cache refill)
        #
        # For timing, the mux on wr_data/sel/addr is not
        # dependent on anything other than the current state.

        with m.If(r1.write_bram):
            # Write store data to BRAM.  This happens one
            # cycle after the store is in r0.
            comb += wr_data.eq(r1.req.data)
            comb += wr_sel.eq(r1.req.byte_sel)
            comb += wr_addr.eq(get_row(r1.req.real_addr))

        with m.Else():
            # Otherwise, we might be doing a reload or a DCBZ
            with m.If(r1.dcbz):
                comb += wr_data.eq(0)
            with m.Else():
                comb += wr_data.eq(bus.dat_r)
            comb += wr_addr.eq(r1.store_row)
            comb += wr_sel.eq(~0) # all 1s

        # set up Cache Rams
        for i in range(NUM_WAYS):
            do_write = Signal(name="do_wr%d" % i)
            wr_sel_m = Signal(ROW_SIZE, name="wr_sel_m_%d" % i)
            d_out = Signal(WB_DATA_BITS, name="dout_%d" % i) # cache_row_t

            way = CacheRam(ROW_BITS, WB_DATA_BITS, ADD_BUF=True, ram_num=i)
            setattr(m.submodules, "cacheram_%d" % i, way)

            comb += way.rd_en.eq(do_read)
            comb += way.rd_addr.eq(rd_addr)
            comb += d_out.eq(way.rd_data_o)
            comb += way.wr_sel.eq(wr_sel_m)
            comb += way.wr_addr.eq(wr_addr)
            comb += way.wr_data.eq(wr_data)

            # Cache hit reads
            with m.If(hwe.o[i]):
                comb += cache_out_row.eq(d_out)

            # these are mutually-exclusive via their Decoder-enablers
            # (note: Decoder-enable is inverted)
            comb += do_write.eq(hre.o[i] | rwe.o[i])

            # Mask write selects with do_write since BRAM
            # doesn't have a global write-enable
            with m.If(do_write):
                comb += wr_sel_m.eq(wr_sel)

    # Cache hit synchronous machine for the easy case.
    # This handles load hits.
    # It also handles error cases (TLB miss, cache paradox)
    def dcache_fast_hit(self, m, req_op, r0_valid, r0, r1,
                        req_hit_way, req_index, req_tag, access_ok,
                        tlb_hit, tlb_req_index):
        comb = m.d.comb
        sync = m.d.sync

        with m.If(req_op != Op.OP_NONE):
            sync += Display("op:%d addr:%x nc: %d idx: %x tag: %x way: %x",
                            req_op, r0.req.addr, r0.req.nc,
                            req_index, req_tag, req_hit_way)

        with m.If(r0_valid):
            sync += r1.mmu_req.eq(r0.mmu_req)

        # Fast path for load/store hits.
        # Set signals for the writeback controls.
        sync += r1.hit_way.eq(req_hit_way)
        sync += r1.hit_index.eq(req_index)

        sync += r1.hit_load_valid.eq(req_op == Op.OP_LOAD_HIT)
        sync += r1.cache_hit.eq((req_op == Op.OP_LOAD_HIT) |
                                (req_op == Op.OP_STORE_HIT))

        with m.If(req_op == Op.OP_BAD):
            sync += Display("Signalling ld/st error "
                            "ls_error=%i mmu_error=%i cache_paradox=%i",
                            ~r0.mmu_req, r0.mmu_req, access_ok)
            sync += r1.ls_error.eq(~r0.mmu_req)
            sync += r1.mmu_error.eq(r0.mmu_req)
            sync += r1.cache_paradox.eq(access_ok)
        with m.Else():
            sync += r1.ls_error.eq(0)
            sync += r1.mmu_error.eq(0)
            sync += r1.cache_paradox.eq(0)

        sync += r1.stcx_fail.eq(req_op == Op.OP_STCX_FAIL)

        # Record TLB hit information for updating TLB PLRU
        sync += r1.tlb_hit.eq(tlb_hit)
        sync += r1.tlb_hit_index.eq(tlb_req_index)

    # Memory accesses are handled by this state machine:
    #
    #   * Cache load miss/reload (in conjunction with "rams")
    #   * Load hits for non-cachable forms
    #   * Stores (the collision case is handled in "rams")
    #
    # All wishbone requests generation is done here.
    # This machine operates at stage 1.
    def dcache_slow(self, m, r1, use_forward1_next, use_forward2_next,
                    r0, replace_way,
                    req_hit_way, req_same_tag,
                    r0_valid, req_op, cache_tags, req_go, ra):

        comb = m.d.comb
        sync = m.d.sync
        bus = self.bus
        d_in = self.d_in

        req = MemAccessRequest("mreq_ds")

        req_row = Signal(ROW_BITS)
        req_idx = Signal(INDEX_BITS)
        req_tag = Signal(TAG_BITS)
        comb += req_idx.eq(get_index(req.real_addr))
        comb += req_row.eq(get_row(req.real_addr))
        comb += req_tag.eq(get_tag(req.real_addr))

        sync += r1.use_forward1.eq(use_forward1_next)
        sync += r1.forward_sel.eq(0)

        with m.If(use_forward1_next):
            sync += r1.forward_sel.eq(r1.req.byte_sel)
        with m.Elif(use_forward2_next):
            sync += r1.forward_sel.eq(r1.forward_sel1)

        sync += r1.forward_data2.eq(r1.forward_data1)
        with m.If(r1.write_bram):
            sync += r1.forward_data1.eq(r1.req.data)
            sync += r1.forward_sel1.eq(r1.req.byte_sel)
            sync += r1.forward_way1.eq(r1.req.hit_way)
            sync += r1.forward_row1.eq(get_row(r1.req.real_addr))
            sync += r1.forward_valid1.eq(1)
        with m.Else():
            with m.If(r1.dcbz):
                sync += r1.forward_data1.eq(0)
            with m.Else():
                sync += r1.forward_data1.eq(bus.dat_r)
            sync += r1.forward_sel1.eq(~0) # all 1s
            sync += r1.forward_way1.eq(replace_way)
            sync += r1.forward_row1.eq(r1.store_row)
            sync += r1.forward_valid1.eq(0)

        # One cycle pulses reset
        sync += r1.slow_valid.eq(0)
        sync += r1.write_bram.eq(0)
        sync += r1.inc_acks.eq(0)
        sync += r1.dec_acks.eq(0)

        sync += r1.ls_valid.eq(0)
        # complete tlbies and TLB loads in the third cycle
        sync += r1.mmu_done.eq(r0_valid & (r0.tlbie | r0.tlbld))

        with m.If((req_op == Op.OP_LOAD_HIT) | (req_op == Op.OP_STCX_FAIL)):
            with m.If(~r0.mmu_req):
                sync += r1.ls_valid.eq(1)
            with m.Else():
                sync += r1.mmu_done.eq(1)

        with m.If(r1.write_tag):
            # Store new tag in selected way
            replace_way_onehot = Signal(NUM_WAYS)
            comb += replace_way_onehot.eq(1<<replace_way)
            for i in range(NUM_WAYS):
                with m.If(replace_way_onehot[i]):
                    ct = Signal(TAG_RAM_WIDTH)
                    comb += ct.eq(cache_tags[r1.store_index].tag)
                    comb += ct.word_select(i, TAG_WIDTH).eq(r1.reload_tag)
                    sync += cache_tags[r1.store_index].tag.eq(ct)
            sync += r1.store_way.eq(replace_way)
            sync += r1.write_tag.eq(0)

        # Take request from r1.req if there is one there,
        # else from req_op, ra, etc.
        with m.If(r1.full):
            comb += req.eq(r1.req)
        with m.Else():
            comb += req.op.eq(req_op)
            comb += req.valid.eq(req_go)
            comb += req.mmu_req.eq(r0.mmu_req)
            comb += req.dcbz.eq(r0.req.dcbz)
            comb += req.real_addr.eq(ra)

            with m.If(r0.req.dcbz):
                # force data to 0 for dcbz
                comb += req.data.eq(0)
            with m.Elif(r0.d_valid):
                comb += req.data.eq(r0.req.data)
            with m.Else():
                comb += req.data.eq(d_in.data)

            # Select all bytes for dcbz
            # and for cacheable loads
            with m.If(r0.req.dcbz | (r0.req.load & ~r0.req.nc)):
                comb += req.byte_sel.eq(~0) # all 1s
            with m.Else():
                comb += req.byte_sel.eq(r0.req.byte_sel)
            comb += req.hit_way.eq(req_hit_way)
            comb += req.same_tag.eq(req_same_tag)

        # Store the incoming request from r0,
        # if it is a slow request
        # Note that r1.full = 1 implies req_op = OP_NONE
        with m.If((req_op == Op.OP_LOAD_MISS)
                  | (req_op == Op.OP_LOAD_NC)
                  | (req_op == Op.OP_STORE_MISS)
                  | (req_op == Op.OP_STORE_HIT)):
            sync += r1.req.eq(req)
            sync += r1.full.eq(1)

        # Main state machine
        with m.Switch(r1.state):

            with m.Case(State.IDLE):
                sync += r1.wb.adr.eq(req.real_addr[ROW_LINE_BITS:])
                sync += r1.wb.sel.eq(req.byte_sel)
                sync += r1.wb.dat.eq(req.data)
                sync += r1.dcbz.eq(req.dcbz)

                # Keep track of our index and way
                # for subsequent stores.
                sync += r1.store_index.eq(req_idx)
                sync += r1.store_row.eq(req_row)
                sync += r1.end_row_ix.eq(get_row_of_line(req_row)-1)
                sync += r1.reload_tag.eq(req_tag)
                sync += r1.req.same_tag.eq(1)

                with m.If(req.op == Op.OP_STORE_HIT):
                    sync += r1.store_way.eq(req.hit_way)

                # Reset per-row valid bits,
                # ready for handling OP_LOAD_MISS
                for i in range(ROW_PER_LINE):
                    sync += r1.rows_valid[i].eq(0)

                with m.If(req_op != Op.OP_NONE):
                    sync += Display("cache op %d", req.op)

                with m.Switch(req.op):
                    with m.Case(Op.OP_LOAD_HIT):
                        # stay in IDLE state
                        pass

                    with m.Case(Op.OP_LOAD_MISS):
                        sync += Display("cache miss real addr: %x " \
                                        "idx: %x tag: %x",
                                        req.real_addr, req_row, req_tag)

                        # Start the wishbone cycle
                        sync += r1.wb.we.eq(0)
                        sync += r1.wb.cyc.eq(1)
                        sync += r1.wb.stb.eq(1)

                        # Track that we had one request sent
                        sync += r1.state.eq(State.RELOAD_WAIT_ACK)
                        sync += r1.write_tag.eq(1)

                    with m.Case(Op.OP_LOAD_NC):
                        sync += r1.wb.cyc.eq(1)
                        sync += r1.wb.stb.eq(1)
                        sync += r1.wb.we.eq(0)
                        sync += r1.state.eq(State.NC_LOAD_WAIT_ACK)

                    with m.Case(Op.OP_STORE_HIT, Op.OP_STORE_MISS):
                        with m.If(~req.dcbz):
                            sync += r1.state.eq(State.STORE_WAIT_ACK)
                            sync += r1.acks_pending.eq(1)
                            sync += r1.full.eq(0)
                            sync += r1.slow_valid.eq(1)

                            with m.If(~req.mmu_req):
                                sync += r1.ls_valid.eq(1)
                            with m.Else():
                                sync += r1.mmu_done.eq(1)

                            with m.If(req.op == Op.OP_STORE_HIT):
                                sync += r1.write_bram.eq(1)
                        with m.Else():
                            # dcbz is handled much like a load miss except
                            # that we are writing to memory instead of reading
                            sync += r1.state.eq(State.RELOAD_WAIT_ACK)

                            with m.If(req.op == Op.OP_STORE_MISS):
                                sync += r1.write_tag.eq(1)

                        sync += r1.wb.we.eq(1)
                        sync += r1.wb.cyc.eq(1)
                        sync += r1.wb.stb.eq(1)

                    # OP_NONE and OP_BAD do nothing
                    # OP_BAD & OP_STCX_FAIL were
                    # handled above already
                    with m.Case(Op.OP_NONE):
                        pass
                    with m.Case(Op.OP_BAD):
                        pass
                    with m.Case(Op.OP_STCX_FAIL):
                        pass

            with m.Case(State.RELOAD_WAIT_ACK):
                ld_stbs_done = Signal()
                # Requests are all sent if stb is 0
                comb += ld_stbs_done.eq(~r1.wb.stb)

                # If we are still sending requests, was one accepted?
                with m.If((~bus.stall) & r1.wb.stb):
                    # That was the last word?  We are done sending.
                    # Clear stb and set ld_stbs_done so we can handle an
                    # eventual last ack on the same cycle.
                    # sigh - reconstruct wb adr with 3 extra 0s at front
                    wb_adr = Cat(Const(0, ROW_OFF_BITS), r1.wb.adr)
                    with m.If(is_last_row_addr(wb_adr, r1.end_row_ix)):
                        sync += r1.wb.stb.eq(0)
                        comb += ld_stbs_done.eq(1)

                    # Calculate the next row address in the current cache line
                    row = Signal(LINE_OFF_BITS-ROW_OFF_BITS)
                    comb += row.eq(r1.wb.adr)
                    sync += r1.wb.adr[:LINE_OFF_BITS-ROW_OFF_BITS].eq(row+1)

                # Incoming acks processing
                sync += r1.forward_valid1.eq(bus.ack)
                with m.If(bus.ack):
                    srow = Signal(ROW_LINE_BITS)
                    comb += srow.eq(r1.store_row)
                    sync += r1.rows_valid[srow].eq(1)

                    # If this is the data we were looking for,
                    # we can complete the request next cycle.
                    # Compare the whole address in case the
                    # request in r1.req is not the one that
                    # started this refill.
                    with m.If(req.valid & r1.req.same_tag &
                              ((r1.dcbz & r1.req.dcbz) |
                               (~r1.dcbz & (r1.req.op == Op.OP_LOAD_MISS))) &
                              (r1.store_row == get_row(req.real_addr))):
                        sync += r1.full.eq(0)
                        sync += r1.slow_valid.eq(1)
                        with m.If(~r1.mmu_req):
                            sync += r1.ls_valid.eq(1)
                        with m.Else():
                            sync += r1.mmu_done.eq(1)
                        sync += r1.forward_sel.eq(~0) # all 1s
                        sync += r1.use_forward1.eq(1)

                    # Check for completion
                    with m.If(ld_stbs_done & is_last_row(r1.store_row,
                                                         r1.end_row_ix)):
                        # Complete wishbone cycle
                        sync += r1.wb.cyc.eq(0)

                        # Cache line is now valid
                        cv = Signal(NUM_WAYS)
                        comb += cv.eq(cache_tags[r1.store_index].valid)
                        comb += cv.bit_select(r1.store_way, 1).eq(1)
                        sync += cache_tags[r1.store_index].valid.eq(cv)

                        sync += r1.state.eq(State.IDLE)
                        sync += Display("cache valid set %x "
                                        "idx %d way %d",
                                        cv, r1.store_index, r1.store_way)

                    # Increment store row counter
                    sync += r1.store_row.eq(next_row(r1.store_row))

            with m.Case(State.STORE_WAIT_ACK):
                st_stbs_done = Signal()
                acks = Signal(3)
                adjust_acks = Signal(3)

                comb += st_stbs_done.eq(~r1.wb.stb)
                comb += acks.eq(r1.acks_pending)

                with m.If(r1.inc_acks != r1.dec_acks):
                    with m.If(r1.inc_acks):
                        comb += adjust_acks.eq(acks + 1)
                    with m.Else():
                        comb += adjust_acks.eq(acks - 1)
                with m.Else():
                    comb += adjust_acks.eq(acks)

                sync += r1.acks_pending.eq(adjust_acks)

                # Clear stb when slave accepted request
                with m.If(~bus.stall):
                    # See if there is another store waiting
                    # to be done which is in the same real page.
                    with m.If(req.valid):
                        _ra = req.real_addr[ROW_LINE_BITS:SET_SIZE_BITS]
                        sync += r1.wb.adr[0:SET_SIZE_BITS].eq(_ra)
                        sync += r1.wb.dat.eq(req.data)
                        sync += r1.wb.sel.eq(req.byte_sel)

                    with m.If((adjust_acks < 7) & req.same_tag &
                              ((req.op == Op.OP_STORE_MISS)
                               | (req.op == Op.OP_STORE_HIT))):
                        sync += r1.wb.stb.eq(1)
                        comb += st_stbs_done.eq(0)

                        with m.If(req.op == Op.OP_STORE_HIT):
                            sync += r1.write_bram.eq(1)
                        sync += r1.full.eq(0)
                        sync += r1.slow_valid.eq(1)

                        # Store requests never come from the MMU
                        sync += r1.ls_valid.eq(1)
                        comb += st_stbs_done.eq(0)
                        sync += r1.inc_acks.eq(1)
                    with m.Else():
                        sync += r1.wb.stb.eq(0)
                        comb += st_stbs_done.eq(1)

                # Got ack ? See if complete.
                with m.If(bus.ack):
                    with m.If(st_stbs_done & (adjust_acks == 1)):
                        sync += r1.state.eq(State.IDLE)
                        sync += r1.wb.cyc.eq(0)
                        sync += r1.wb.stb.eq(0)
                    sync += r1.dec_acks.eq(1)

            with m.Case(State.NC_LOAD_WAIT_ACK):
                # Clear stb when slave accepted request
                with m.If(~bus.stall):
                    sync += r1.wb.stb.eq(0)

                # Got ack ? complete.
                with m.If(bus.ack):
                    sync += r1.state.eq(State.IDLE)
                    sync += r1.full.eq(0)
                    sync += r1.slow_valid.eq(1)

                    with m.If(~r1.mmu_req):
                        sync += r1.ls_valid.eq(1)
                    with m.Else():
                        sync += r1.mmu_done.eq(1)

                    sync += r1.forward_sel.eq(~0) # all 1s
                    sync += r1.use_forward1.eq(1)
                    sync += r1.wb.cyc.eq(0)
                    sync += r1.wb.stb.eq(0)

    def dcache_log(self, m, r1, valid_ra, tlb_hit, stall_out, req_op):

        sync = m.d.sync
        d_out, bus, log_out = self.d_out, self.bus, self.log_out

        sync += log_out.eq(Cat(r1.state[:3], valid_ra, tlb_hit.way[:3],
                               stall_out, req_op[:3], d_out.valid, d_out.error,
                               r1.wb.cyc, r1.wb.stb, bus.ack, bus.stall,
                               r1.req.real_addr[3:6]))

    def elaborate(self, platform):

        m = Module()
        comb = m.d.comb
        d_in = self.d_in

        # Storage. Hopefully "cache_rows" is a BRAM, the rest is LUTs
        cache_tags = CacheTagArray()
        cache_tag_set = Signal(TAG_RAM_WIDTH)

        # TODO attribute ram_style : string;
        # TODO attribute ram_style of cache_tags : signal is "distributed";

        """note: these are passed to nmigen.hdl.Memory as "attributes".
           don't know how, just that they are.
        """
        # TODO attribute ram_style of
        #  dtlb_tags : signal is "distributed";
        # TODO attribute ram_style of
        #  dtlb_ptes : signal is "distributed";

        r0 = RegStage0("r0")
        r0_full = Signal()

        r1 = RegStage1("r1")

        reservation = Reservation()

        # Async signals on incoming request
        req_index = Signal(INDEX_BITS)
        req_row = Signal(ROW_BITS)
        req_hit_way = Signal(WAY_BITS)
        req_tag = Signal(TAG_BITS)
        req_op = Signal(Op)
        req_data = Signal(64)
        req_same_tag = Signal()
        req_go = Signal()

        early_req_row = Signal(ROW_BITS)

        cancel_store = Signal()
        set_rsrv = Signal()
        clear_rsrv = Signal()

        r0_valid = Signal()
        r0_stall = Signal()

        use_forward1_next = Signal()
        use_forward2_next = Signal()

        cache_out_row = Signal(WB_DATA_BITS)

        plru_victim = Signal(WAY_BITS)
        replace_way = Signal(WAY_BITS)

        # Wishbone read/write/cache write formatting signals
        bus_sel = Signal(8)

        # TLB signals
        tlb_way = TLBRecord("tlb_way")
        tlb_req_index = Signal(TLB_SET_BITS)
        tlb_hit = TLBHit("tlb_hit")
        pte = Signal(TLB_PTE_BITS)
        ra = Signal(REAL_ADDR_BITS)
        valid_ra = Signal()
        perm_attr = PermAttr("dc_perms")
        rc_ok = Signal()
        perm_ok = Signal()
        access_ok = Signal()

        tlb_plru_victim = Signal(TLB_WAY_BITS)

        # we don't yet handle collisions between loadstore1 requests
        # and MMU requests
        comb += self.m_out.stall.eq(0)

        # Hold off the request in r0 when r1 has an uncompleted request
        comb += r0_stall.eq(r0_full & (r1.full | d_in.hold))
        comb += r0_valid.eq(r0_full & ~r1.full & ~d_in.hold)
        comb += self.stall_out.eq(r0_stall)

        # deal with litex not doing wishbone pipeline mode
        # XXX in wrong way.  FIFOs are needed in the SRAM test
        # so that stb/ack match up. same thing done in icache.py
        comb += self.bus.stall.eq(self.bus.cyc & ~self.bus.ack)

        # Wire up wishbone request latch out of stage 1
        comb += self.bus.we.eq(r1.wb.we)
        comb += self.bus.adr.eq(r1.wb.adr)
        comb += self.bus.sel.eq(r1.wb.sel)
        comb += self.bus.stb.eq(r1.wb.stb)
        comb += self.bus.dat_w.eq(r1.wb.dat)
        comb += self.bus.cyc.eq(r1.wb.cyc)

        # create submodule TLBUpdate
        m.submodules.dtlb_update = self.dtlb_update = DTLBUpdate()

        # call sub-functions putting everything together, using shared
        # signals established above
        self.stage_0(m, r0, r1, r0_full)
        self.tlb_read(m, r0_stall, tlb_way)
        self.tlb_search(m, tlb_req_index, r0, r0_valid,
                        tlb_way,
                        pte, tlb_hit, valid_ra, perm_attr, ra)
        self.tlb_update(m, r0_valid, r0, tlb_req_index,
                        tlb_hit, tlb_plru_victim,
                        tlb_way)
        self.maybe_plrus(m, r1, plru_victim)
        self.maybe_tlb_plrus(m, r1, tlb_plru_victim, tlb_req_index)
        self.cache_tag_read(m, r0_stall, req_index, cache_tag_set, cache_tags)
        self.dcache_request(m, r0, ra, req_index, req_row, req_tag,
                            r0_valid, r1, cache_tags, replace_way,
                            use_forward1_next, use_forward2_next,
                            req_hit_way, plru_victim, rc_ok, perm_attr,
                            valid_ra, perm_ok, access_ok, req_op, req_go,
                            tlb_hit, tlb_way, cache_tag_set,
                            cancel_store, req_same_tag, r0_stall,
                            early_req_row)
        self.reservation_comb(m, cancel_store, set_rsrv, clear_rsrv,
                              r0_valid, r0, reservation)
        self.reservation_reg(m, r0_valid, access_ok, set_rsrv, clear_rsrv,
                             reservation, r0)
        self.writeback_control(m, r1, cache_out_row)
        self.rams(m, r1, early_req_row, cache_out_row, replace_way)
        self.dcache_fast_hit(m, req_op, r0_valid, r0, r1,
                             req_hit_way, req_index, req_tag, access_ok,
                             tlb_hit, tlb_req_index)
        self.dcache_slow(m, r1, use_forward1_next, use_forward2_next,
                         r0, replace_way,
                         req_hit_way, req_same_tag,
                         r0_valid, req_op, cache_tags, req_go, ra)
        #self.dcache_log(m, r1, valid_ra, tlb_hit, self.stall_out, req_op)

        return m


if __name__ == '__main__':
    dut = DCache()
    vl = rtlil.convert(dut, ports=[])
    with open("test_dcache.il", "w") as f:
        f.write(vl)
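
# Running this file directly elaborates DCache and writes the rtlil
# netlist to test_dcache.il; that file can then be inspected with
# yosys (illustrative note, not used programmatically).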