# [soc.git] / src / soc / experiment / dcache.py
# commit: another major bug, CacheTagArray valid was only 1 bit not NUM_WAYS
1 """DCache
2
3 based on Anton Blanchard microwatt dcache.vhdl
4
5 note that the microwatt dcache wishbone interface expects "stall".
6 for simplicity at the moment this is hard-coded to cyc & ~ack.
7 see WB4 spec, p84, section 5.2.1
8
9 IMPORTANT: for store, the data is sampled the cycle AFTER the "valid"
10 is raised. sigh
11
12 Links:
13
14 * https://libre-soc.org/3d_gpu/architecture/set_associative_cache.jpg
15 * https://bugs.libre-soc.org/show_bug.cgi?id=469
16
17 """
18
19 import sys
20
21 from nmutil.gtkw import write_gtkw
22
23 sys.setrecursionlimit(1000000)
24
25 from enum import Enum, unique
26
27 from nmigen import (Module, Signal, Elaboratable, Cat, Repl, Array, Const,
28 Record, Memory)
29 from nmutil.util import Display
30 from nmigen.lib.coding import Decoder
31
32 from copy import deepcopy
33 from random import randint, seed
34
35 from nmigen_soc.wishbone.bus import Interface
36
37 from nmigen.cli import main
38 from nmutil.iocontrol import RecordObject
39 from nmigen.utils import log2_int
40 from soc.experiment.mem_types import (LoadStore1ToDCacheType,
41 DCacheToLoadStore1Type,
42 MMUToDCacheType,
43 DCacheToMMUType)
44
45 from soc.experiment.wb_types import (WB_ADDR_BITS, WB_DATA_BITS, WB_SEL_BITS,
46 WBAddrType, WBDataType, WBSelType,
47 WBMasterOut, WBSlaveOut,
48 WBMasterOutVector, WBSlaveOutVector,
49 WBIOMasterOut, WBIOSlaveOut)
50
51 from soc.experiment.cache_ram import CacheRam
52 #from soc.experiment.plru import PLRU
53 from nmutil.plru import PLRU, PLRUs
54
55 # for test
56 from soc.bus.sram import SRAM
57 from nmigen import Memory
58 from nmigen.cli import rtlil
59
60 # NOTE: to use cxxsim, export NMIGEN_SIM_MODE=cxxsim from the shell
61 # Also, check out the cxxsim nmigen branch, and latest yosys from git
62 from nmutil.sim_tmp_alternative import Simulator
63
64 from nmutil.util import wrap
65
66
# TODO: make these parameters of DCache at some point
LINE_SIZE = 64    # Line size in bytes
NUM_LINES = 16    # Number of lines in a set
NUM_WAYS = 4      # Number of ways
TLB_SET_SIZE = 64 # L1 DTLB number of sets
TLB_NUM_WAYS = 2  # L1 DTLB number of ways (associativity)
TLB_LG_PGSZ = 12  # L1 DTLB log_2(page_size)
LOG_LENGTH = 0    # Non-zero to enable log data collection

# BRAM organisation: We never access more than
# -- WB_DATA_BITS at a time so to save
# -- resources we make the array only that wide, and
# -- use consecutive indices to make a cache "line"
# --
# -- ROW_SIZE is the width in bytes of the BRAM
# -- (based on WB, so 64-bits)
ROW_SIZE = WB_DATA_BITS // 8

# ROW_PER_LINE is the number of rows (wishbone
# transactions) in a line
ROW_PER_LINE = LINE_SIZE // ROW_SIZE

# BRAM_ROWS is the number of rows in BRAM needed
# to represent the full dcache
BRAM_ROWS = NUM_LINES * ROW_PER_LINE

print ("ROW_SIZE", ROW_SIZE)
print ("ROW_PER_LINE", ROW_PER_LINE)
print ("BRAM_ROWS", BRAM_ROWS)
print ("NUM_WAYS", NUM_WAYS)

# Bit fields counts in the address

# REAL_ADDR_BITS is the number of real address
# bits that we store
REAL_ADDR_BITS = 56

# ROW_BITS is the number of bits to select a row
ROW_BITS = log2_int(BRAM_ROWS)

# ROW_LINE_BITS is the number of bits to select
# a row within a line
ROW_LINE_BITS = log2_int(ROW_PER_LINE)

# LINE_OFF_BITS is the number of bits for
# the offset in a cache line
LINE_OFF_BITS = log2_int(LINE_SIZE)

# ROW_OFF_BITS is the number of bits for
# the offset in a row
ROW_OFF_BITS = log2_int(ROW_SIZE)

# INDEX_BITS is the number of bits to
# select a cache line
INDEX_BITS = log2_int(NUM_LINES)

# SET_SIZE_BITS is the log base 2 of the set size
SET_SIZE_BITS = LINE_OFF_BITS + INDEX_BITS

# TAG_BITS is the number of bits of
# the tag part of the address
TAG_BITS = REAL_ADDR_BITS - SET_SIZE_BITS

# TAG_WIDTH is the width in bits of each way of the tag RAM
TAG_WIDTH = TAG_BITS + 7 - ((TAG_BITS + 7) % 8)

# WAY_BITS is the number of bits to select a way
WAY_BITS = log2_int(NUM_WAYS)

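# With the default parameters above (LINE_SIZE=64, NUM_LINES=16, NUM_WAYS=4,
# 64-bit wishbone) the numbers work out as: ROW_SIZE=8, ROW_PER_LINE=8,
# BRAM_ROWS=128, ROW_BITS=7, ROW_LINE_BITS=3, LINE_OFF_BITS=6, ROW_OFF_BITS=3,
# INDEX_BITS=4, SET_SIZE_BITS=10, TAG_BITS=46, TAG_WIDTH=48, WAY_BITS=2.
# the invariant below always holds by construction (TAG_WIDTH is TAG_BITS
# rounded up to a whole number of bytes); it is a sanity-check added here
# for illustration, not part of the original microwatt-derived code.
assert TAG_WIDTH % 8 == 0 and TAG_WIDTH >= TAG_BITS
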
# Layout example (the values below are filled in from the parameters above):
layout = f"""\
  DCache Layout:
 |.. -----------------------| REAL_ADDR_BITS ({REAL_ADDR_BITS})
  ..         |--------------| SET_SIZE_BITS ({SET_SIZE_BITS})
  ..  tag    |index|  line  |
  ..         |   row   |    |
  ..         |     |---|    | ROW_LINE_BITS ({ROW_LINE_BITS})
  ..         |     |--- - --| LINE_OFF_BITS ({LINE_OFF_BITS})
  ..         |         |- --| ROW_OFF_BITS ({ROW_OFF_BITS})
  ..         |----- ---|    | ROW_BITS ({ROW_BITS})
  ..         |-----|        | INDEX_BITS ({INDEX_BITS})
  .. --------|              | TAG_BITS ({TAG_BITS})
"""
print (layout)
print ("Dcache TAG %d IDX %d ROW_BITS %d ROFF %d LOFF %d RLB %d" % \
            (TAG_BITS, INDEX_BITS, ROW_BITS,
             ROW_OFF_BITS, LINE_OFF_BITS, ROW_LINE_BITS))
print ("index @: %d-%d" % (LINE_OFF_BITS, SET_SIZE_BITS))
print ("row @: %d-%d" % (LINE_OFF_BITS, ROW_OFF_BITS))
print ("tag @: %d-%d width %d" % (SET_SIZE_BITS, REAL_ADDR_BITS, TAG_WIDTH))

TAG_RAM_WIDTH = TAG_WIDTH * NUM_WAYS

print ("TAG_RAM_WIDTH", TAG_RAM_WIDTH)
print ("    TAG_WIDTH", TAG_WIDTH)
print ("     NUM_WAYS", NUM_WAYS)

def CacheTagArray():
    tag_layout = [('valid', NUM_WAYS),
                  ('tag', TAG_RAM_WIDTH),
                 ]
    return Array(Record(tag_layout, name="tag%d" % x) for x in range(NUM_LINES))
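
# note on the layout above (cf. the commit message: "CacheTagArray valid
# was only 1 bit not NUM_WAYS"): 'valid' is one bit per way, and 'tag'
# packs NUM_WAYS byte-aligned tags, so way w's tag occupies bits
# [w*TAG_WIDTH : w*TAG_WIDTH + TAG_BITS] -- see read_tag() below.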

def RowPerLineValidArray():
    return Array(Signal(name="rows_valid%d" % x) \
                 for x in range(ROW_PER_LINE))

# L1 TLB
TLB_SET_BITS = log2_int(TLB_SET_SIZE)
TLB_WAY_BITS = log2_int(TLB_NUM_WAYS)
TLB_EA_TAG_BITS = 64 - (TLB_LG_PGSZ + TLB_SET_BITS)
TLB_TAG_WAY_BITS = TLB_NUM_WAYS * TLB_EA_TAG_BITS
TLB_PTE_BITS = 64
TLB_PTE_WAY_BITS = TLB_NUM_WAYS * TLB_PTE_BITS

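# with the defaults (64 sets, 2 ways, 4KiB pages) this gives 6 set-index
# bits from EA[12:18], a 46-bit EA tag from EA[18:64], and per-set tag/PTE
# rows of 2*46 = 92 and 2*64 = 128 bits.  the assert below is an
# illustrative invariant (not upstream code): tag + set-index + page-offset
# bits must exactly cover a 64-bit effective address.
assert TLB_EA_TAG_BITS + TLB_SET_BITS + TLB_LG_PGSZ == 64
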
def ispow2(x):
    return (1<<log2_int(x, False)) == x

assert (LINE_SIZE % ROW_SIZE) == 0, "LINE_SIZE not multiple of ROW_SIZE"
assert ispow2(LINE_SIZE), "LINE_SIZE not power of 2"
assert ispow2(NUM_LINES), "NUM_LINES not power of 2"
assert ispow2(ROW_PER_LINE), "ROW_PER_LINE not power of 2"
assert ROW_BITS == (INDEX_BITS + ROW_LINE_BITS), "geometry bits don't add up"
assert (LINE_OFF_BITS == ROW_OFF_BITS + ROW_LINE_BITS), \
        "geometry bits don't add up"
assert REAL_ADDR_BITS == (TAG_BITS + INDEX_BITS + LINE_OFF_BITS), \
        "geometry bits don't add up"
assert REAL_ADDR_BITS == (TAG_BITS + ROW_BITS + ROW_OFF_BITS), \
        "geometry bits don't add up"
assert 64 == WB_DATA_BITS, "Can't yet handle wb width that isn't 64-bits"
assert SET_SIZE_BITS <= TLB_LG_PGSZ, "Set indexed by virtual address"


def TLBHit(name):
    return Record([('valid', 1),
                   ('way', TLB_WAY_BITS)], name=name)

def TLBTagEAArray():
    return Array(Signal(TLB_EA_TAG_BITS, name="tlbtagea%d" % x) \
                 for x in range (TLB_NUM_WAYS))

def TLBRecord(name):
    tlb_layout = [('valid', TLB_NUM_WAYS),
                  ('tag', TLB_TAG_WAY_BITS),
                  ('pte', TLB_PTE_WAY_BITS)
                 ]
    return Record(tlb_layout, name=name)

def TLBValidArray():
    return Array(Signal(TLB_NUM_WAYS, name="tlb_valid%d" % x)
                 for x in range(TLB_SET_SIZE))

def HitWaySet():
    return Array(Signal(WAY_BITS, name="hitway_%d" % x) \
                 for x in range(TLB_NUM_WAYS))

# Cache RAM interface
def CacheRamOut():
    return Array(Signal(WB_DATA_BITS, name="cache_out%d" % x) \
                 for x in range(NUM_WAYS))

# PLRU output interface
def PLRUOut():
    return Array(Signal(WAY_BITS, name="plru_out%d" % x) \
                 for x in range(NUM_LINES))

# TLB PLRU output interface
def TLBPLRUOut():
    return Array(Signal(TLB_WAY_BITS, name="tlbplru_out%d" % x) \
                 for x in range(TLB_SET_SIZE))

# Helper functions to decode incoming requests
#
# Return the cache line index (tag index) for an address
def get_index(addr):
    return addr[LINE_OFF_BITS:SET_SIZE_BITS]

# Return the cache row index (data memory) for an address
def get_row(addr):
    return addr[ROW_OFF_BITS:SET_SIZE_BITS]

# Return the index of a row within a line
def get_row_of_line(row):
    return row[:ROW_BITS][:ROW_LINE_BITS]

# Returns whether this is the last row of a line
def is_last_row_addr(addr, last):
    return addr[ROW_OFF_BITS:LINE_OFF_BITS] == last

# Returns whether this is the last row of a line
def is_last_row(row, last):
    return get_row_of_line(row) == last

# Return the next row in the current cache line. We use a
# dedicated function in order to limit the size of the
# generated adder to be only the bits within a cache line
# (3 bits with default settings)
def next_row(row):
    row_v = row[0:ROW_LINE_BITS] + 1
    return Cat(row_v[:ROW_LINE_BITS], row[ROW_LINE_BITS:])

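# worked example for next_row() (default geometry, ROW_LINE_BITS=3): a row
# whose low 3 bits are 0b111 wraps back to 0b000 of the *same* line, e.g.
# 0b0001_111 -> 0b0001_000; the upper (index) bits pass through untouched,
# which is what keeps the generated adder only 3 bits wide.
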
# Get the tag value from the address
def get_tag(addr):
    return addr[SET_SIZE_BITS:REAL_ADDR_BITS]

# Read a tag from a tag memory row
def read_tag(way, tagset):
    return tagset.word_select(way, TAG_WIDTH)[:TAG_BITS]

# Read a TLB tag from a TLB tag memory row
def read_tlb_tag(way, tags):
    return tags.word_select(way, TLB_EA_TAG_BITS)

# Write a TLB tag to a TLB tag memory row
def write_tlb_tag(way, tags, tag):
    return read_tlb_tag(way, tags).eq(tag)

# Read a PTE from a TLB PTE memory row
def read_tlb_pte(way, ptes):
    return ptes.word_select(way, TLB_PTE_BITS)

def write_tlb_pte(way, ptes, newpte):
    return read_tlb_pte(way, ptes).eq(newpte)

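# plain-Python mirrors of the Signal-slicing helpers above -- an
# illustrative sketch only (the "_py_" names are not part of the original
# code), handy for checking the geometry offline:
def _py_get_index(addr):
    return (addr >> LINE_OFF_BITS) & ((1 << INDEX_BITS) - 1)

def _py_get_row(addr):
    return (addr >> ROW_OFF_BITS) & ((1 << ROW_BITS) - 1)

def _py_get_tag(addr):
    return (addr >> SET_SIZE_BITS) & ((1 << TAG_BITS) - 1)

# e.g. with the default geometry, real address 0x12345678 splits into
# index 0x9, row 0x4f and tag 0x48d15:
assert _py_get_index(0x12345678) == 0x9
assert _py_get_row(0x12345678) == 0x4f
assert _py_get_tag(0x12345678) == 0x48d15
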

# Record for storing permission, attribute, etc. bits from a PTE
class PermAttr(RecordObject):
    def __init__(self, name=None):
        super().__init__(name=name)
        self.reference = Signal()
        self.changed = Signal()
        self.nocache = Signal()
        self.priv = Signal()
        self.rd_perm = Signal()
        self.wr_perm = Signal()


def extract_perm_attr(pte):
    # note: currently an unused stub -- perm_attr is filled in directly
    # from the PTE bits in DCache.tlb_search() below
    pa = PermAttr()
    return pa

# Type of operation on a "valid" input
@unique
class Op(Enum):
    OP_NONE = 0
    OP_BAD = 1           # NC cache hit, TLB miss, prot/RC failure
    OP_STCX_FAIL = 2     # conditional store w/o reservation
    OP_LOAD_HIT = 3      # Cache hit on load
    OP_LOAD_MISS = 4     # Load missing cache
    OP_LOAD_NC = 5       # Non-cachable load
    OP_STORE_HIT = 6     # Store hitting cache
    OP_STORE_MISS = 7    # Store missing cache


# Cache state machine
@unique
class State(Enum):
    IDLE = 0             # Normal load hit processing
    RELOAD_WAIT_ACK = 1  # Cache reload wait ack
    STORE_WAIT_ACK = 2   # Store wait ack
    NC_LOAD_WAIT_ACK = 3 # Non-cachable load wait ack


# Dcache operations:
#
# In order to make timing, we use the BRAMs with
# an output buffer, which means that the BRAM
# output is delayed by an extra cycle.
#
# Thus, the dcache has a 2-stage internal pipeline
# for cache hits with no stalls.
#
# All other operations are handled via stalling
# in the first stage.
#
# The second stage can thus complete a hit at the same
# time as the first stage emits a stall for a complex op.
#
# Stage 0 register, basically contains just the latched request

class RegStage0(RecordObject):
    def __init__(self, name=None):
        super().__init__(name=name)
        self.req = LoadStore1ToDCacheType(name="lsmem")
        self.tlbie = Signal()   # indicates a tlbie request (from MMU)
        self.doall = Signal()   # with tlbie, indicates flush whole TLB
        self.tlbld = Signal()   # indicates a TLB load request (from MMU)
        self.mmu_req = Signal() # indicates source of request
        self.d_valid = Signal() # indicates req.data is valid now


class MemAccessRequest(RecordObject):
    def __init__(self, name=None):
        super().__init__(name=name)
        self.op = Signal(Op)
        self.valid = Signal()
        self.dcbz = Signal()
        self.real_addr = Signal(REAL_ADDR_BITS)
        self.data = Signal(64)
        self.byte_sel = Signal(8)
        self.hit_way = Signal(WAY_BITS)
        self.same_tag = Signal()
        self.mmu_req = Signal()


# First stage register, contains state for stage 1 of load hits
# and for the state machine used by all other operations
class RegStage1(RecordObject):
    def __init__(self, name=None):
        super().__init__(name=name)
        # Info about the request
        self.full = Signal()    # have uncompleted request
        self.mmu_req = Signal() # request is from MMU
        self.req = MemAccessRequest(name="reqmem")

        # Cache hit state
        self.hit_way = Signal(WAY_BITS)
        self.hit_load_valid = Signal()
        self.hit_index = Signal(INDEX_BITS)
        self.cache_hit = Signal()

        # TLB hit state
        self.tlb_hit = TLBHit("tlb_hit")
        self.tlb_hit_index = Signal(TLB_SET_BITS)

        # 2-stage data buffer for data forwarded from writes to reads
        self.forward_data1 = Signal(64)
        self.forward_data2 = Signal(64)
        self.forward_sel1 = Signal(8)
        self.forward_valid1 = Signal()
        self.forward_way1 = Signal(WAY_BITS)
        self.forward_row1 = Signal(ROW_BITS)
        self.use_forward1 = Signal()
        self.forward_sel = Signal(8)

        # Cache miss state (reload state machine)
        self.state = Signal(State)
        self.dcbz = Signal()
        self.write_bram = Signal()
        self.write_tag = Signal()
        self.slow_valid = Signal()
        self.wb = WBMasterOut("wb")
        self.reload_tag = Signal(TAG_BITS)
        self.store_way = Signal(WAY_BITS)
        self.store_row = Signal(ROW_BITS)
        self.store_index = Signal(INDEX_BITS)
        self.end_row_ix = Signal(ROW_LINE_BITS)
        self.rows_valid = RowPerLineValidArray()
        self.acks_pending = Signal(3)
        self.inc_acks = Signal()
        self.dec_acks = Signal()

        # Signals to complete (possibly with error)
        self.ls_valid = Signal()
        self.ls_error = Signal()
        self.mmu_done = Signal()
        self.mmu_error = Signal()
        self.cache_paradox = Signal()

        # Signal to complete a failed stcx.
        self.stcx_fail = Signal()


# Reservation information
class Reservation(RecordObject):
    def __init__(self):
        super().__init__()
        self.valid = Signal()
        self.addr = Signal(64-LINE_OFF_BITS)


class DTLBUpdate(Elaboratable):
    def __init__(self):
        self.tlbie = Signal()
        self.tlbwe = Signal()
        self.doall = Signal()
        self.tlb_hit = TLBHit("tlb_hit")
        self.tlb_req_index = Signal(TLB_SET_BITS)

        self.repl_way = Signal(TLB_WAY_BITS)
        self.eatag = Signal(TLB_EA_TAG_BITS)
        self.pte_data = Signal(TLB_PTE_BITS)

        # read from dtlb array
        self.tlb_read = Signal()
        self.tlb_read_index = Signal(TLB_SET_BITS)
        self.tlb_way = TLBRecord("o_tlb_way")

    def elaborate(self, platform):
        m = Module()
        comb = m.d.comb
        sync = m.d.sync

        # there are 3 parts to this:
        # QTY TLB_NUM_WAYs TAGs - of width (say) 46 bits of Effective Address
        # QTY TLB_NUM_WAYs PTEs - of width (say) 64 bits
        # "Valid" bits, one per "way", of QTY TLB_NUM_WAYs.  these cannot
        # be a Memory because they can all be cleared (tlbie, doall), i mean,
        # we _could_, in theory, by overriding the Reset Signal of the Memory,
        # hmmm....

        dtlb_valid = TLBValidArray()
        tlb_req_index = self.tlb_req_index

        print ("TLB_TAG_WAY_BITS", TLB_TAG_WAY_BITS)
        print ("    TLB_EA_TAG_BITS", TLB_EA_TAG_BITS)
        print ("       TLB_NUM_WAYS", TLB_NUM_WAYS)
        print ("TLB_PTE_WAY_BITS", TLB_PTE_WAY_BITS)
        print ("       TLB_PTE_BITS", TLB_PTE_BITS)
        print ("       TLB_NUM_WAYS", TLB_NUM_WAYS)

        # TAG and PTE Memory SRAMs. transparent, write-enables are TLB_NUM_WAYS
        tagway = Memory(depth=TLB_SET_SIZE, width=TLB_TAG_WAY_BITS)
        m.submodules.rd_tagway = rd_tagway = tagway.read_port()
        m.submodules.wr_tagway = wr_tagway = tagway.write_port(
                                    granularity=TLB_EA_TAG_BITS)

        pteway = Memory(depth=TLB_SET_SIZE, width=TLB_PTE_WAY_BITS)
        m.submodules.rd_pteway = rd_pteway = pteway.read_port()
        m.submodules.wr_pteway = wr_pteway = pteway.write_port(
                                    granularity=TLB_PTE_BITS)

        # commented out for now, can be put in if Memory.reset can be
        # used for tlbie&doall to reset the entire Memory to zero in 1 cycle
        #validm = Memory(depth=TLB_SET_SIZE, width=TLB_NUM_WAYS)
        #m.submodules.rd_valid = rd_valid = validm.read_port()
        #m.submodules.wr_valid = wr_valid = validm.write_port(
        #                            granularity=1)

        # connect up read and write addresses to Valid/PTE/TAG SRAMs
        m.d.comb += rd_pteway.addr.eq(self.tlb_read_index)
        m.d.comb += rd_tagway.addr.eq(self.tlb_read_index)
        #m.d.comb += rd_valid.addr.eq(self.tlb_read_index)
        m.d.comb += wr_tagway.addr.eq(tlb_req_index)
        m.d.comb += wr_pteway.addr.eq(tlb_req_index)
        #m.d.comb += wr_valid.addr.eq(tlb_req_index)

        updated = Signal()
        v_updated = Signal()
        tb_out = Signal(TLB_TAG_WAY_BITS) # tlb_way_tags_t
        db_out = Signal(TLB_NUM_WAYS)     # tlb_way_valids_t
        pb_out = Signal(TLB_PTE_WAY_BITS) # tlb_way_ptes_t
        dv = Signal(TLB_NUM_WAYS)         # tlb_way_valids_t

        comb += dv.eq(dtlb_valid[tlb_req_index])
        comb += db_out.eq(dv)

        with m.If(self.tlbie & self.doall):
            # clear all valid bits at once
            # XXX hmmm, validm _could_ use Memory reset here...
            for i in range(TLB_SET_SIZE):
                sync += dtlb_valid[i].eq(0)
        with m.Elif(self.tlbie):
            # invalidate just the hit_way
            with m.If(self.tlb_hit.valid):
                comb += db_out.bit_select(self.tlb_hit.way, 1).eq(0)
                comb += v_updated.eq(1)
        with m.Elif(self.tlbwe):
            # write to the requested tag and PTE
            comb += write_tlb_tag(self.repl_way, tb_out, self.eatag)
            comb += write_tlb_pte(self.repl_way, pb_out, self.pte_data)
            # set valid bit
            comb += db_out.bit_select(self.repl_way, 1).eq(1)

            comb += updated.eq(1)
            comb += v_updated.eq(1)

        # above, sometimes valid is requested to be updated but data not
        # therefore split them out, here.  note the granularity thing matches
        # with the shift-up of the eatag/pte_data into the correct TLB way.
        # thus is it not necessary to write the entire lot, just the portion
        # being altered: hence writing the *old* copy of the row is not needed
        with m.If(updated): # PTE and TAG to be written
            comb += wr_pteway.data.eq(pb_out)
            comb += wr_pteway.en.eq(1<<self.repl_way)
            comb += wr_tagway.data.eq(tb_out)
            comb += wr_tagway.en.eq(1<<self.repl_way)
        with m.If(v_updated): # Valid to be written
            sync += dtlb_valid[tlb_req_index].eq(db_out)
            #comb += wr_valid.data.eq(db_out)
            #comb += wr_valid.en.eq(1<<self.repl_way)

        # select one TLB way, use a register here
        r_tlb_way = TLBRecord("r_tlb_way")
        r_delay = Signal()
        sync += r_delay.eq(self.tlb_read)
        with m.If(self.tlb_read):
            sync += self.tlb_way.valid.eq(dtlb_valid[self.tlb_read_index])
        with m.If(r_delay):
            # on one clock delay, output the contents of the read port(s)
            # comb += self.tlb_way.valid.eq(rd_valid.data)
            comb += self.tlb_way.tag.eq(rd_tagway.data)
            comb += self.tlb_way.pte.eq(rd_pteway.data)
            # and also capture the (delayed) output...
            #sync += r_tlb_way.valid.eq(rd_valid.data)
            sync += r_tlb_way.tag.eq(rd_tagway.data)
            sync += r_tlb_way.pte.eq(rd_pteway.data)
        with m.Else():
            # ... so that the register can output it when no read is requested
            # it's rather overkill but better to be safe than sorry
            comb += self.tlb_way.tag.eq(r_tlb_way.tag)
            comb += self.tlb_way.pte.eq(r_tlb_way.pte)
        #comb += self.tlb_way.eq(r_tlb_way)

        return m


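# sanity-check (illustrative, not upstream code): the write ports above are
# created with granularity equal to one way's width, so their "en" inputs
# are one bit per way -- which is exactly what the 1<<repl_way writes in
# DTLBUpdate.elaborate() rely on.
assert TLB_TAG_WAY_BITS // TLB_EA_TAG_BITS == TLB_NUM_WAYS
assert TLB_PTE_WAY_BITS // TLB_PTE_BITS == TLB_NUM_WAYS
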
class DCachePendingHit(Elaboratable):

    def __init__(self, tlb_way,
                 cache_i_validdx, cache_tag_set,
                 req_addr):

        self.go = Signal()
        self.virt_mode = Signal()
        self.is_hit = Signal()
        self.tlb_hit = TLBHit("tlb_hit")
        self.hit_way = Signal(WAY_BITS)
        self.rel_match = Signal()
        self.req_index = Signal(INDEX_BITS)
        self.reload_tag = Signal(TAG_BITS)

        self.tlb_way = tlb_way
        self.cache_i_validdx = cache_i_validdx
        self.cache_tag_set = cache_tag_set
        self.req_addr = req_addr

    def elaborate(self, platform):
        m = Module()
        comb = m.d.comb
        sync = m.d.sync

        go = self.go
        virt_mode = self.virt_mode
        is_hit = self.is_hit
        tlb_way = self.tlb_way
        cache_i_validdx = self.cache_i_validdx
        cache_tag_set = self.cache_tag_set
        req_addr = self.req_addr
        tlb_hit = self.tlb_hit
        hit_way = self.hit_way
        rel_match = self.rel_match
        req_index = self.req_index
        reload_tag = self.reload_tag

        hit_set     = Array(Signal(name="hit_set_%d" % i) \
                            for i in range(TLB_NUM_WAYS))
        rel_matches = Array(Signal(name="rel_matches_%d" % i) \
                            for i in range(TLB_NUM_WAYS))
        hit_way_set = HitWaySet()

        # Test if pending request is a hit on any way
        # In order to make timing in virtual mode,
        # when we are using the TLB, we compare each
        # way with each of the real addresses from each way of
        # the TLB, and then decide later which match to use.

        with m.If(virt_mode):
            for j in range(TLB_NUM_WAYS): # tlb_num_way_t
                s_tag = Signal(TAG_BITS, name="s_tag%d" % j)
                s_hit = Signal()
                s_pte = Signal(TLB_PTE_BITS)
                s_ra  = Signal(REAL_ADDR_BITS)
                comb += s_pte.eq(read_tlb_pte(j, tlb_way.pte))
                comb += s_ra.eq(Cat(req_addr[0:TLB_LG_PGSZ],
                                    s_pte[TLB_LG_PGSZ:REAL_ADDR_BITS]))
                comb += s_tag.eq(get_tag(s_ra))

                for i in range(NUM_WAYS): # way_t
                    is_tag_hit = Signal(name="is_tag_hit_%d_%d" % (j, i))
                    comb += is_tag_hit.eq(go & cache_i_validdx[i] &
                                  (read_tag(i, cache_tag_set) == s_tag)
                                  & (tlb_way.valid[j]))
                    with m.If(is_tag_hit):
                        comb += hit_way_set[j].eq(i)
                        comb += s_hit.eq(1)
                comb += hit_set[j].eq(s_hit)
                with m.If(s_tag == reload_tag):
                    comb += rel_matches[j].eq(1)
            with m.If(tlb_hit.valid):
                comb += is_hit.eq(hit_set[tlb_hit.way])
                comb += hit_way.eq(hit_way_set[tlb_hit.way])
                comb += rel_match.eq(rel_matches[tlb_hit.way])
        with m.Else():
            s_tag = Signal(TAG_BITS)
            comb += s_tag.eq(get_tag(req_addr))
            for i in range(NUM_WAYS): # way_t
                is_tag_hit = Signal(name="is_tag_hit_%d" % i)
                comb += is_tag_hit.eq(go & cache_i_validdx[i] &
                          (read_tag(i, cache_tag_set) == s_tag))
                with m.If(is_tag_hit):
                    comb += hit_way.eq(i)
                    comb += is_hit.eq(1)
            with m.If(s_tag == reload_tag):
                comb += rel_match.eq(1)

        return m


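# note (illustrative): in virtual mode DCachePendingHit instantiates
# TLB_NUM_WAYS * NUM_WAYS tag comparators (2 * 4 = 8 with the defaults),
# deferring the choice of TLB way until tlb_hit.way is known -- that is
# the timing trick described in the comment inside elaborate() above.
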
class DCache(Elaboratable):
    """Set associative dcache write-through

    TODO (in no specific order):
    * See list in icache.vhdl
    * Complete load misses on the cycle when WB data comes instead of
      at the end of line (this requires dealing with requests coming in
      while not idle...)
    """
    def __init__(self):
        self.d_in = LoadStore1ToDCacheType("d_in")
        self.d_out = DCacheToLoadStore1Type("d_out")

        self.m_in = MMUToDCacheType("m_in")
        self.m_out = DCacheToMMUType("m_out")

        self.stall_out = Signal()

        # standard naming (wired to non-standard for compatibility)
        self.bus = Interface(addr_width=32,
                             data_width=64,
                             granularity=8,
                             features={'stall'},
                             alignment=0,
                             name="dcache")

        self.log_out = Signal(20)

    def stage_0(self, m, r0, r1, r0_full):
        """Latch the request in r0.req as long as we're not stalling
        """
        comb = m.d.comb
        sync = m.d.sync
        d_in, d_out, m_in = self.d_in, self.d_out, self.m_in

        r = RegStage0("stage0")

        # TODO, this goes in unit tests and formal proofs
        with m.If(d_in.valid & m_in.valid):
            sync += Display("request collision loadstore vs MMU")

        with m.If(m_in.valid):
            comb += r.req.valid.eq(1)
            comb += r.req.load.eq(~(m_in.tlbie | m_in.tlbld)) # no invalidate
            comb += r.req.dcbz.eq(0)
            comb += r.req.nc.eq(0)
            comb += r.req.reserve.eq(0)
            comb += r.req.virt_mode.eq(0)
            comb += r.req.priv_mode.eq(1)
            comb += r.req.addr.eq(m_in.addr)
            comb += r.req.data.eq(m_in.pte)
            comb += r.req.byte_sel.eq(~0) # Const -1 sets all to 0b111....
            comb += r.tlbie.eq(m_in.tlbie)
            comb += r.doall.eq(m_in.doall)
            comb += r.tlbld.eq(m_in.tlbld)
            comb += r.mmu_req.eq(1)
            m.d.sync += Display("    DCACHE req mmu addr %x pte %x ld %d",
                                m_in.addr, m_in.pte, r.req.load)

        with m.Else():
            comb += r.req.eq(d_in)
            comb += r.req.data.eq(0)
            comb += r.tlbie.eq(0)
            comb += r.doall.eq(0)
            comb += r.tlbld.eq(0)
            comb += r.mmu_req.eq(0)
        with m.If((~r1.full & ~d_in.hold) | ~r0_full):
            sync += r0.eq(r)
            sync += r0_full.eq(r.req.valid)
        # Sample data the cycle after a request comes in from loadstore1.
        # If another request has come in already then the data will get
        # put directly into req.data below.
        with m.If(r0.req.valid & ~r.req.valid & ~r0.d_valid &
                  ~r0.mmu_req):
            sync += r0.req.data.eq(d_in.data)
            sync += r0.d_valid.eq(1)
        with m.If(d_in.valid):
            m.d.sync += Display("    DCACHE req cache "
                                "virt %d addr %x data %x ld %d",
                                r.req.virt_mode, r.req.addr,
                                r.req.data, r.req.load)

    def tlb_read(self, m, r0_stall, tlb_way):
        """TLB
        Operates in the second cycle on the request latched in r0.req.
        TLB updates write the entry at the end of the second cycle.
        """
        comb = m.d.comb
        sync = m.d.sync
        m_in, d_in = self.m_in, self.d_in

        addrbits = Signal(TLB_SET_BITS)

        amin = TLB_LG_PGSZ
        amax = TLB_LG_PGSZ + TLB_SET_BITS

        with m.If(m_in.valid):
            comb += addrbits.eq(m_in.addr[amin : amax])
        with m.Else():
            comb += addrbits.eq(d_in.addr[amin : amax])

        # If we have any op and the previous op isn't finished,
        # then keep the same output for next cycle.
        d = self.dtlb_update
        comb += d.tlb_read_index.eq(addrbits)
        comb += d.tlb_read.eq(~r0_stall)
        comb += tlb_way.eq(d.tlb_way)

    def maybe_tlb_plrus(self, m, r1, tlb_plru_victim, tlb_req_index):
        """Generate TLB PLRUs
        """
        comb = m.d.comb
        sync = m.d.sync

        if TLB_NUM_WAYS == 0:
            return

        # suite of PLRUs with a selection and output mechanism
        tlb_plrus = PLRUs(TLB_SET_SIZE, TLB_WAY_BITS)
        m.submodules.tlb_plrus = tlb_plrus
        comb += tlb_plrus.way.eq(r1.tlb_hit.way)
        comb += tlb_plrus.valid.eq(r1.tlb_hit.valid)
        comb += tlb_plrus.index.eq(r1.tlb_hit_index)
        comb += tlb_plrus.isel.eq(tlb_req_index)      # select victim
        comb += tlb_plru_victim.eq(tlb_plrus.o_index) # selected victim

    def tlb_search(self, m, tlb_req_index, r0, r0_valid,
                   tlb_way,
                   pte, tlb_hit, valid_ra, perm_attr, ra):

        comb = m.d.comb

        hitway = Signal(TLB_WAY_BITS)
        hit = Signal()
        eatag = Signal(TLB_EA_TAG_BITS)

        TLB_LG_END = TLB_LG_PGSZ + TLB_SET_BITS
        comb += tlb_req_index.eq(r0.req.addr[TLB_LG_PGSZ : TLB_LG_END])
        comb += eatag.eq(r0.req.addr[TLB_LG_END : 64])

        for i in range(TLB_NUM_WAYS):
            is_tag_hit = Signal(name="is_tag_hit%d" % i)
            tlb_tag = Signal(TLB_EA_TAG_BITS, name="tlb_tag%d" % i)
            comb += tlb_tag.eq(read_tlb_tag(i, tlb_way.tag))
            comb += is_tag_hit.eq((tlb_way.valid[i]) & (tlb_tag == eatag))
            with m.If(is_tag_hit):
                comb += hitway.eq(i)
                comb += hit.eq(1)

        comb += tlb_hit.valid.eq(hit & r0_valid)
        comb += tlb_hit.way.eq(hitway)

        with m.If(tlb_hit.valid):
            comb += pte.eq(read_tlb_pte(hitway, tlb_way.pte))
        comb += valid_ra.eq(tlb_hit.valid | ~r0.req.virt_mode)

        with m.If(r0.req.virt_mode):
            comb += ra.eq(Cat(Const(0, ROW_OFF_BITS),
                              r0.req.addr[ROW_OFF_BITS:TLB_LG_PGSZ],
                              pte[TLB_LG_PGSZ:REAL_ADDR_BITS]))
            comb += perm_attr.reference.eq(pte[8])
            comb += perm_attr.changed.eq(pte[7])
            comb += perm_attr.nocache.eq(pte[5])
            comb += perm_attr.priv.eq(pte[3])
            comb += perm_attr.rd_perm.eq(pte[2])
            comb += perm_attr.wr_perm.eq(pte[1])
        with m.Else():
            comb += ra.eq(Cat(Const(0, ROW_OFF_BITS),
                              r0.req.addr[ROW_OFF_BITS:REAL_ADDR_BITS]))
            comb += perm_attr.reference.eq(1)
            comb += perm_attr.changed.eq(1)
            comb += perm_attr.nocache.eq(0)
            comb += perm_attr.priv.eq(1)
            comb += perm_attr.rd_perm.eq(1)
            comb += perm_attr.wr_perm.eq(1)

        with m.If(valid_ra):
            m.d.sync += Display("DCACHE virt mode %d hit %d ra %x pte %x",
                                r0.req.virt_mode, tlb_hit.valid, ra, pte)
            m.d.sync += Display("       perm ref=%d", perm_attr.reference)
            m.d.sync += Display("       perm chg=%d", perm_attr.changed)
            m.d.sync += Display("       perm noc=%d", perm_attr.nocache)
            m.d.sync += Display("       perm prv=%d", perm_attr.priv)
            m.d.sync += Display("       perm rdp=%d", perm_attr.rd_perm)
            m.d.sync += Display("       perm wrp=%d", perm_attr.wr_perm)

    def tlb_update(self, m, r0_valid, r0, tlb_req_index,
                   tlb_hit, tlb_plru_victim):

        comb = m.d.comb
        sync = m.d.sync

        tlbie = Signal()
        tlbwe = Signal()

        comb += tlbie.eq(r0_valid & r0.tlbie)
        comb += tlbwe.eq(r0_valid & r0.tlbld)

        d = self.dtlb_update

        comb += d.tlbie.eq(tlbie)
        comb += d.tlbwe.eq(tlbwe)
        comb += d.doall.eq(r0.doall)
        comb += d.tlb_hit.eq(tlb_hit)
        comb += d.tlb_req_index.eq(tlb_req_index)

        with m.If(tlb_hit.valid):
            comb += d.repl_way.eq(tlb_hit.way)
        with m.Else():
            comb += d.repl_way.eq(tlb_plru_victim)
        comb += d.eatag.eq(r0.req.addr[TLB_LG_PGSZ + TLB_SET_BITS:64])
        comb += d.pte_data.eq(r0.req.data)

    def maybe_plrus(self, m, r1, plru_victim):
        """Generate PLRUs
        """
        comb = m.d.comb
        sync = m.d.sync

        if NUM_WAYS == 0:
            return

        # suite of PLRUs with a selection and output mechanism
        m.submodules.plrus = plrus = PLRUs(NUM_LINES, WAY_BITS)
        comb += plrus.way.eq(r1.hit_way)
        comb += plrus.valid.eq(r1.cache_hit)
        comb += plrus.index.eq(r1.hit_index)
        comb += plrus.isel.eq(r1.store_index)     # select victim
        comb += plru_victim.eq(plrus.o_index)     # selected victim

    def cache_tag_read(self, m, r0_stall, req_index, cache_tag_set, cache_tags):
        """Cache tag RAM read port
        """
        comb = m.d.comb
        sync = m.d.sync
        m_in, d_in = self.m_in, self.d_in

        index = Signal(INDEX_BITS)

        with m.If(r0_stall):
            comb += index.eq(req_index)
        with m.Elif(m_in.valid):
            comb += index.eq(get_index(m_in.addr))
        with m.Else():
            comb += index.eq(get_index(d_in.addr))
        sync += cache_tag_set.eq(cache_tags[index].tag)

    def dcache_request(self, m, r0, ra, req_index, req_row, req_tag,
                       r0_valid, r1, cache_tags, replace_way,
                       use_forward1_next, use_forward2_next,
                       req_hit_way, plru_victim, rc_ok, perm_attr,
                       valid_ra, perm_ok, access_ok, req_op, req_go,
                       tlb_hit, tlb_way, cache_tag_set,
                       cancel_store, req_same_tag, r0_stall, early_req_row):
        """Cache request parsing and hit detection
        """

        comb = m.d.comb
        m_in, d_in = self.m_in, self.d_in

        is_hit = Signal()
        hit_way = Signal(WAY_BITS)
        op = Signal(Op)
        opsel = Signal(3)
        go = Signal()
        nc = Signal()
        cache_i_validdx = Signal(NUM_WAYS)

        # Extract line, row and tag from request
        comb += req_index.eq(get_index(r0.req.addr))
        comb += req_row.eq(get_row(r0.req.addr))
        comb += req_tag.eq(get_tag(ra))

        if False: # display on comb is a bit... busy.
            comb += Display("dcache_req addr:%x ra: %x idx: %x tag: %x row: %x",
                            r0.req.addr, ra, req_index, req_tag, req_row)

        comb += go.eq(r0_valid & ~(r0.tlbie | r0.tlbld) & ~r1.ls_error)
        comb += cache_i_validdx.eq(cache_tags[req_index].valid)

        m.submodules.dcache_pend = dc = DCachePendingHit(tlb_way,
                                            cache_i_validdx, cache_tag_set,
                                            r0.req.addr)
        comb += dc.tlb_hit.eq(tlb_hit)
        comb += dc.reload_tag.eq(r1.reload_tag)
        comb += dc.virt_mode.eq(r0.req.virt_mode)
        comb += dc.go.eq(go)
        comb += dc.req_index.eq(req_index)

        comb += is_hit.eq(dc.is_hit)
        comb += hit_way.eq(dc.hit_way)
        comb += req_same_tag.eq(dc.rel_match)

        # See if the request matches the line currently being reloaded
        with m.If((r1.state == State.RELOAD_WAIT_ACK) &
                  (req_index == r1.store_index) & req_same_tag):
            # For a store, consider this a hit even if the row isn't
            # valid since it will be by the time we perform the store.
            # For a load, check the appropriate row valid bit.
            rrow = Signal(ROW_LINE_BITS)
            comb += rrow.eq(req_row)
            valid = r1.rows_valid[rrow]
            comb += is_hit.eq((~r0.req.load) | valid)
            comb += hit_way.eq(replace_way)

        # Whether to use forwarded data for a load or not
        with m.If((get_row(r1.req.real_addr) == req_row) &
                  (r1.req.hit_way == hit_way)):
            # Only need to consider r1.write_bram here, since if we
            # are writing refill data here, then we don't have a
            # cache hit this cycle on the line being refilled.
            # (There is the possibility that the load following the
            # load miss that started the refill could be to the old
            # contents of the victim line, since it is a couple of
            # cycles after the refill starts before we see the updated
            # cache tag. In that case we don't use the bypass.)
            comb += use_forward1_next.eq(r1.write_bram)
        with m.If((r1.forward_row1 == req_row) & (r1.forward_way1 == hit_way)):
            comb += use_forward2_next.eq(r1.forward_valid1)

        # The way that matched on a hit
        comb += req_hit_way.eq(hit_way)

        # The way to replace on a miss
        with m.If(r1.write_tag):
            comb += replace_way.eq(plru_victim)
        with m.Else():
            comb += replace_way.eq(r1.store_way)

        # work out whether we have permission for this access
        # NB we don't yet implement AMR, thus no KUAP
        comb += rc_ok.eq(perm_attr.reference
                         & (r0.req.load | perm_attr.changed))
        comb += perm_ok.eq((r0.req.priv_mode | (~perm_attr.priv)) &
                           (perm_attr.wr_perm |
                            (r0.req.load & perm_attr.rd_perm)))
        comb += access_ok.eq(valid_ra & perm_ok & rc_ok)

        # Combine the request and cache hit status to decide what
        # operation needs to be done
        comb += nc.eq(r0.req.nc | perm_attr.nocache)
        comb += op.eq(Op.OP_NONE)
        with m.If(go):
            with m.If(~access_ok):
                m.d.sync += Display("DCACHE access fail valid_ra=%d p=%d rc=%d",
                                    valid_ra, perm_ok, rc_ok)
                comb += op.eq(Op.OP_BAD)
            with m.Elif(cancel_store):
                m.d.sync += Display("DCACHE cancel store")
                comb += op.eq(Op.OP_STCX_FAIL)
            with m.Else():
                m.d.sync += Display("DCACHE valid_ra=%d nc=%d ld=%d",
                                    valid_ra, nc, r0.req.load)
                comb += opsel.eq(Cat(is_hit, nc, r0.req.load))
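                # opsel is Cat(is_hit, nc, load): bit 0 = hit, bit 1 =
                # non-cacheable, bit 2 = load, so e.g. 0b101 is a
                # cacheable load that hit and 0b011 is a non-cacheable
                # store "hit", a paradox handled as OP_BAD below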
                with m.Switch(opsel):
                    with m.Case(0b101): comb += op.eq(Op.OP_LOAD_HIT)
                    with m.Case(0b100): comb += op.eq(Op.OP_LOAD_MISS)
                    with m.Case(0b110): comb += op.eq(Op.OP_LOAD_NC)
                    with m.Case(0b001): comb += op.eq(Op.OP_STORE_HIT)
                    with m.Case(0b000): comb += op.eq(Op.OP_STORE_MISS)
                    with m.Case(0b010): comb += op.eq(Op.OP_STORE_MISS)
                    with m.Case(0b011): comb += op.eq(Op.OP_BAD)
                    with m.Case(0b111): comb += op.eq(Op.OP_BAD)
        comb += req_op.eq(op)
        comb += req_go.eq(go)

        # Version of the row number that is valid one cycle earlier
        # in the cases where we need to read the cache data BRAM.
        # If we're stalling then we need to keep reading the last
        # row requested.
        with m.If(~r0_stall):
            with m.If(m_in.valid):
                comb += early_req_row.eq(get_row(m_in.addr))
            with m.Else():
                comb += early_req_row.eq(get_row(d_in.addr))
        with m.Else():
            comb += early_req_row.eq(req_row)

    def reservation_comb(self, m, cancel_store, set_rsrv, clear_rsrv,
                         r0_valid, r0, reservation):
        """Handle load-with-reservation and store-conditional instructions
        """
        comb = m.d.comb

        with m.If(r0_valid & r0.req.reserve):
            # XXX generate alignment interrupt if address
            # is not aligned XXX or if r0.req.nc = '1'
            with m.If(r0.req.load):
                comb += set_rsrv.eq(r0.req.atomic_last) # load with reservation
            with m.Else():
                comb += clear_rsrv.eq(r0.req.atomic_last) # store conditional
                with m.If((~reservation.valid) |
                          (r0.req.addr[LINE_OFF_BITS:64] != reservation.addr)):
                    comb += cancel_store.eq(1)

    def reservation_reg(self, m, r0_valid, access_ok, set_rsrv, clear_rsrv,
                        reservation, r0):
        comb = m.d.comb
        sync = m.d.sync

        with m.If(r0_valid & access_ok):
            with m.If(clear_rsrv):
                sync += reservation.valid.eq(0)
            with m.Elif(set_rsrv):
                sync += reservation.valid.eq(1)
                sync += reservation.addr.eq(r0.req.addr[LINE_OFF_BITS:64])

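    # note (illustrative): reservation.addr keeps only bits
    # [LINE_OFF_BITS:64], i.e. the reservation granule here is the cache
    # line (64 bytes by default), matching microwatt -- a store-conditional
    # to a different line is cancelled via cancel_store above.
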
    def writeback_control(self, m, r1, cache_out_row):
        """Return data for loads & completion control logic
        """
        comb = m.d.comb
        sync = m.d.sync
        d_out, m_out = self.d_out, self.m_out

        data_out = Signal(64)
        data_fwd = Signal(64)

        # Use the bypass if we are reading the row that was
        # written 1 or 2 cycles ago, including for the
        # slow_valid = 1 case (i.e. completing a load
        # miss or a non-cacheable load).
        with m.If(r1.use_forward1):
            comb += data_fwd.eq(r1.forward_data1)
        with m.Else():
            comb += data_fwd.eq(r1.forward_data2)

        comb += data_out.eq(cache_out_row)

        for i in range(8):
            with m.If(r1.forward_sel[i]):
                dsel = data_fwd.word_select(i, 8)
                comb += data_out.word_select(i, 8).eq(dsel)

        # DCache output to LoadStore
        comb += d_out.valid.eq(r1.ls_valid)
        comb += d_out.data.eq(data_out)
        comb += d_out.store_done.eq(~r1.stcx_fail)
        comb += d_out.error.eq(r1.ls_error)
        comb += d_out.cache_paradox.eq(r1.cache_paradox)

        # Outputs to MMU
        comb += m_out.done.eq(r1.mmu_done)
        comb += m_out.err.eq(r1.mmu_error)
        comb += m_out.data.eq(data_out)

        # We have a valid load or store hit or we just completed
        # a slow op such as a load miss, a NC load or a store
        #
        # Note: the load hit is delayed by one cycle. However it
        # can still not collide with r.slow_valid (well unless I
        # miscalculated) because slow_valid can only be set on a
        # subsequent request and not on its first cycle (the state
        # machine must have advanced), which makes slow_valid
        # at least 2 cycles from the previous hit_load_valid.

        # Sanity: Only one of these must be set in any given cycle

        if False: # TODO: need Display to get this to work
            assert (r1.slow_valid & r1.stcx_fail) != 1, \
                "unexpected slow_valid collision with stcx_fail"

            assert ((r1.slow_valid | r1.stcx_fail) | r1.hit_load_valid) != 1, \
                "unexpected hit_load_delayed collision with slow_valid"

        with m.If(~r1.mmu_req):
            # Request came from loadstore1...
            # Load hit case is the standard path
            with m.If(r1.hit_load_valid):
                sync += Display("completing load hit data=%x", data_out)

            # error cases complete without stalling
            with m.If(r1.ls_error):
                with m.If(r1.dcbz):
                    sync += Display("completing dcbz with error")
                with m.Else():
                    sync += Display("completing ld/st with error")

            # Slow ops (load miss, NC, stores)
            with m.If(r1.slow_valid):
                sync += Display("completing store or load miss adr=%x data=%x",
                                r1.req.real_addr, data_out)

        with m.Else():
            # Request came from MMU
            with m.If(r1.hit_load_valid):
                sync += Display("completing load hit to MMU, data=%x",
                                m_out.data)
            # error cases complete without stalling
            with m.If(r1.mmu_error):
                sync += Display("completing MMU ld with error")

            # Slow ops (i.e. load miss)
            with m.If(r1.slow_valid):
                sync += Display("completing MMU load miss, adr=%x data=%x",
                                r1.req.real_addr, m_out.data)

    def rams(self, m, r1, early_req_row, cache_out_row, replace_way):
        """rams
        Generate a cache RAM for each way. This handles the normal
        reads, writes from reloads and the special store-hit update
        path as well.

        Note: the BRAMs have an extra read buffer, meaning the output
        is pipelined an extra cycle. This differs from the
        icache. The writeback logic needs to take that into
        account by using 1-cycle delayed signals for load hits.
        """
        comb = m.d.comb
        bus = self.bus

        # binary-to-unary one-hot decoders here.  replace-way one-hot is gated
        # (enabled) by bus.ack, not-write-bram, and state RELOAD_WAIT_ACK
        m.submodules.rams_replace_way_e = rwe = Decoder(NUM_WAYS)
        comb += rwe.n.eq(~((r1.state == State.RELOAD_WAIT_ACK) & bus.ack &
                           ~r1.write_bram))
        comb += rwe.i.eq(replace_way)

        m.submodules.rams_hit_way_e = hwe = Decoder(NUM_WAYS)
        comb += hwe.i.eq(r1.hit_way)

        # this one is gated with write_bram, and replace_way_e can never be
        # set at the same time.  that means that do_write can OR the outputs
        m.submodules.rams_hit_req_way_e = hre = Decoder(NUM_WAYS)
        comb += hre.n.eq(~r1.write_bram) # Decoder.n is inverted
        comb += hre.i.eq(r1.req.hit_way)

        # common Signals
        do_read = Signal()
        wr_addr = Signal(ROW_BITS)
        wr_data = Signal(WB_DATA_BITS)
        wr_sel = Signal(ROW_SIZE)
        rd_addr = Signal(ROW_BITS)

        comb += do_read.eq(1) # always enable
        comb += rd_addr.eq(early_req_row)

        # Write mux:
        #
        # Defaults to wishbone read responses (cache refill)
        #
        # For timing, the mux on wr_data/sel/addr is not
        # dependent on anything other than the current state.

        with m.If(r1.write_bram):
            # Write store data to BRAM.  This happens one
            # cycle after the store is in r0.
            comb += wr_data.eq(r1.req.data)
            comb += wr_sel.eq(r1.req.byte_sel)
            comb += wr_addr.eq(get_row(r1.req.real_addr))

        with m.Else():
            # Otherwise, we might be doing a reload or a DCBZ
            with m.If(r1.dcbz):
                comb += wr_data.eq(0)
            with m.Else():
                comb += wr_data.eq(bus.dat_r)
            comb += wr_addr.eq(r1.store_row)
            comb += wr_sel.eq(~0) # all 1s

        # set up Cache Rams
        for i in range(NUM_WAYS):
            do_write = Signal(name="do_wr%d" % i)
            wr_sel_m = Signal(ROW_SIZE, name="wr_sel_m_%d" % i)
            d_out = Signal(WB_DATA_BITS, name="dout_%d" % i) # cache_row_t

            way = CacheRam(ROW_BITS, WB_DATA_BITS, ADD_BUF=True, ram_num=i)
            setattr(m.submodules, "cacheram_%d" % i, way)

            comb += way.rd_en.eq(do_read)
            comb += way.rd_addr.eq(rd_addr)
            comb += d_out.eq(way.rd_data_o)
            comb += way.wr_sel.eq(wr_sel_m)
            comb += way.wr_addr.eq(wr_addr)
            comb += way.wr_data.eq(wr_data)

            # Cache hit reads
            with m.If(hwe.o[i]):
                comb += cache_out_row.eq(d_out)

            # these are mutually-exclusive via their Decoder-enablers
            # (note: Decoder-enable is inverted)
            comb += do_write.eq(hre.o[i] | rwe.o[i])

            # Mask write selects with do_write since BRAM
            # doesn't have a global write-enable
            with m.If(do_write):
                comb += wr_sel_m.eq(wr_sel)

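    # note on nmigen's Decoder (illustrative): with n deasserted the output
    # o is the one-hot 1<<i, and asserting n forces o to all-zeros, e.g.
    # i=2, n=0 gives o=0b0100 -- which is why rwe/hre above can be gated
    # purely through their n inputs.
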
    # Cache hit synchronous machine for the easy case.
    # This handles load hits.
    # It also handles error cases (TLB miss, cache paradox)
    def dcache_fast_hit(self, m, req_op, r0_valid, r0, r1,
                        req_hit_way, req_index, req_tag, access_ok,
                        tlb_hit, tlb_req_index):
        comb = m.d.comb
        sync = m.d.sync

        with m.If(req_op != Op.OP_NONE):
            sync += Display("op:%d addr:%x nc: %d idx: %x tag: %x way: %x",
                            req_op, r0.req.addr, r0.req.nc,
                            req_index, req_tag, req_hit_way)

        with m.If(r0_valid):
            sync += r1.mmu_req.eq(r0.mmu_req)

        # Fast path for load/store hits.
        # Set signals for the writeback controls.
        sync += r1.hit_way.eq(req_hit_way)
        sync += r1.hit_index.eq(req_index)

        sync += r1.hit_load_valid.eq(req_op == Op.OP_LOAD_HIT)
        sync += r1.cache_hit.eq((req_op == Op.OP_LOAD_HIT) |
                                (req_op == Op.OP_STORE_HIT))

        with m.If(req_op == Op.OP_BAD):
            sync += Display("Signalling ld/st error "
                            "ls_error=%i mmu_error=%i cache_paradox=%i",
                            ~r0.mmu_req, r0.mmu_req, access_ok)
            sync += r1.ls_error.eq(~r0.mmu_req)
            sync += r1.mmu_error.eq(r0.mmu_req)
            sync += r1.cache_paradox.eq(access_ok)
        with m.Else():
            sync += r1.ls_error.eq(0)
            sync += r1.mmu_error.eq(0)
            sync += r1.cache_paradox.eq(0)

        sync += r1.stcx_fail.eq(req_op == Op.OP_STCX_FAIL)

        # Record TLB hit information for updating TLB PLRU
        sync += r1.tlb_hit.eq(tlb_hit)
        sync += r1.tlb_hit_index.eq(tlb_req_index)

    # Memory accesses are handled by this state machine:
    #
    #   * Cache load miss/reload (in conjunction with "rams")
    #   * Load hits for non-cachable forms
    #   * Stores (the collision case is handled in "rams")
    #
    # All wishbone requests generation is done here.
    # This machine operates at stage 1.
    def dcache_slow(self, m, r1, use_forward1_next, use_forward2_next,
                    r0, replace_way,
                    req_hit_way, req_same_tag,
                    r0_valid, req_op, cache_tags, req_go, ra):

        comb = m.d.comb
        sync = m.d.sync
        bus = self.bus
        d_in = self.d_in

        req = MemAccessRequest("mreq_ds")

        req_row = Signal(ROW_BITS)
        req_idx = Signal(INDEX_BITS)
        req_tag = Signal(TAG_BITS)
        comb += req_idx.eq(get_index(req.real_addr))
        comb += req_row.eq(get_row(req.real_addr))
        comb += req_tag.eq(get_tag(req.real_addr))

        sync += r1.use_forward1.eq(use_forward1_next)
        sync += r1.forward_sel.eq(0)

        with m.If(use_forward1_next):
            sync += r1.forward_sel.eq(r1.req.byte_sel)
        with m.Elif(use_forward2_next):
            sync += r1.forward_sel.eq(r1.forward_sel1)

        sync += r1.forward_data2.eq(r1.forward_data1)
        with m.If(r1.write_bram):
            sync += r1.forward_data1.eq(r1.req.data)
            sync += r1.forward_sel1.eq(r1.req.byte_sel)
            sync += r1.forward_way1.eq(r1.req.hit_way)
            sync += r1.forward_row1.eq(get_row(r1.req.real_addr))
            sync += r1.forward_valid1.eq(1)
        with m.Else():
            with m.If(r1.dcbz):
                sync += r1.forward_data1.eq(0)
            with m.Else():
                sync += r1.forward_data1.eq(bus.dat_r)
            sync += r1.forward_sel1.eq(~0) # all 1s
            sync += r1.forward_way1.eq(replace_way)
            sync += r1.forward_row1.eq(r1.store_row)
            sync += r1.forward_valid1.eq(0)

        # One cycle pulses reset
        sync += r1.slow_valid.eq(0)
        sync += r1.write_bram.eq(0)
        sync += r1.inc_acks.eq(0)
        sync += r1.dec_acks.eq(0)

        sync += r1.ls_valid.eq(0)
        # complete tlbies and TLB loads in the third cycle
        sync += r1.mmu_done.eq(r0_valid & (r0.tlbie | r0.tlbld))

        with m.If((req_op == Op.OP_LOAD_HIT) | (req_op == Op.OP_STCX_FAIL)):
            with m.If(~r0.mmu_req):
                sync += r1.ls_valid.eq(1)
            with m.Else():
                sync += r1.mmu_done.eq(1)

        with m.If(r1.write_tag):
            # Store new tag in selected way
            replace_way_onehot = Signal(NUM_WAYS)
            comb += replace_way_onehot.eq(1<<replace_way)
            for i in range(NUM_WAYS):
                with m.If(replace_way_onehot[i]):
                    ct = Signal(TAG_RAM_WIDTH)
                    comb += ct.eq(cache_tags[r1.store_index].tag)
                    comb += ct.word_select(i, TAG_WIDTH).eq(r1.reload_tag)
                    sync += cache_tags[r1.store_index].tag.eq(ct)
            sync += r1.store_way.eq(replace_way)
            sync += r1.write_tag.eq(0)

        # Take request from r1.req if there is one there,
        # else from req_op, ra, etc.
        with m.If(r1.full):
            comb += req.eq(r1.req)
        with m.Else():
            comb += req.op.eq(req_op)
            comb += req.valid.eq(req_go)
            comb += req.mmu_req.eq(r0.mmu_req)
            comb += req.dcbz.eq(r0.req.dcbz)
            comb += req.real_addr.eq(ra)

            with m.If(r0.req.dcbz):
                # force data to 0 for dcbz
                comb += req.data.eq(0)
            with m.Elif(r0.d_valid):
                comb += req.data.eq(r0.req.data)
            with m.Else():
                comb += req.data.eq(d_in.data)

            # Select all bytes for dcbz
            # and for cacheable loads
            with m.If(r0.req.dcbz | (r0.req.load & ~r0.req.nc)):
                comb += req.byte_sel.eq(~0) # all 1s
            with m.Else():
                comb += req.byte_sel.eq(r0.req.byte_sel)
            comb += req.hit_way.eq(req_hit_way)
            comb += req.same_tag.eq(req_same_tag)

        # Store the incoming request from r0,
        # if it is a slow request
        # Note that r1.full = 1 implies req_op = OP_NONE
        with m.If((req_op == Op.OP_LOAD_MISS)
                  | (req_op == Op.OP_LOAD_NC)
                  | (req_op == Op.OP_STORE_MISS)
                  | (req_op == Op.OP_STORE_HIT)):
            sync += r1.req.eq(req)
            sync += r1.full.eq(1)

        # Main state machine
        with m.Switch(r1.state):

            with m.Case(State.IDLE):
                sync += r1.wb.adr.eq(req.real_addr[ROW_LINE_BITS:])
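                # with the default geometry ROW_LINE_BITS and ROW_OFF_BITS
                # are both 3, so this drops the 8-byte row offset from the
                # real address; RELOAD_WAIT_ACK below prepends ROW_OFF_BITS
                # zeros when reconstructing the byte address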
                sync += r1.wb.sel.eq(req.byte_sel)
                sync += r1.wb.dat.eq(req.data)
                sync += r1.dcbz.eq(req.dcbz)

                # Keep track of our index and way
                # for subsequent stores.
                sync += r1.store_index.eq(req_idx)
                sync += r1.store_row.eq(req_row)
                sync += r1.end_row_ix.eq(get_row_of_line(req_row)-1)
                sync += r1.reload_tag.eq(req_tag)
                sync += r1.req.same_tag.eq(1)

                with m.If(req.op == Op.OP_STORE_HIT):
                    sync += r1.store_way.eq(req.hit_way)

                # Reset per-row valid bits,
                # ready for handling OP_LOAD_MISS
                for i in range(ROW_PER_LINE):
                    sync += r1.rows_valid[i].eq(0)

                with m.If(req_op != Op.OP_NONE):
                    sync += Display("cache op %d", req.op)

                with m.Switch(req.op):
                    with m.Case(Op.OP_LOAD_HIT):
                        # stay in IDLE state
                        pass

                    with m.Case(Op.OP_LOAD_MISS):
                        sync += Display("cache miss real addr: %x " \
                                        "idx: %x tag: %x",
                                        req.real_addr, req_row, req_tag)

                        # Start the wishbone cycle
                        sync += r1.wb.we.eq(0)
                        sync += r1.wb.cyc.eq(1)
                        sync += r1.wb.stb.eq(1)

                        # Track that we had one request sent
                        sync += r1.state.eq(State.RELOAD_WAIT_ACK)
                        sync += r1.write_tag.eq(1)

                    with m.Case(Op.OP_LOAD_NC):
                        sync += r1.wb.cyc.eq(1)
                        sync += r1.wb.stb.eq(1)
                        sync += r1.wb.we.eq(0)
                        sync += r1.state.eq(State.NC_LOAD_WAIT_ACK)

                    with m.Case(Op.OP_STORE_HIT, Op.OP_STORE_MISS):
                        with m.If(~req.dcbz):
                            sync += r1.state.eq(State.STORE_WAIT_ACK)
                            sync += r1.acks_pending.eq(1)
                            sync += r1.full.eq(0)
                            sync += r1.slow_valid.eq(1)

                            with m.If(~req.mmu_req):
                                sync += r1.ls_valid.eq(1)
                            with m.Else():
                                sync += r1.mmu_done.eq(1)

                            with m.If(req.op == Op.OP_STORE_HIT):
                                sync += r1.write_bram.eq(1)
                        with m.Else():
                            # dcbz is handled much like a load miss except
                            # that we are writing to memory instead of reading
                            sync += r1.state.eq(State.RELOAD_WAIT_ACK)

                            with m.If(req.op == Op.OP_STORE_MISS):
                                sync += r1.write_tag.eq(1)

                        sync += r1.wb.we.eq(1)
                        sync += r1.wb.cyc.eq(1)
                        sync += r1.wb.stb.eq(1)

                    # OP_NONE and OP_BAD do nothing
                    # OP_BAD & OP_STCX_FAIL were
                    # handled above already
                    with m.Case(Op.OP_NONE):
                        pass
                    with m.Case(Op.OP_BAD):
                        pass
                    with m.Case(Op.OP_STCX_FAIL):
                        pass

            with m.Case(State.RELOAD_WAIT_ACK):
                ld_stbs_done = Signal()
                # Requests are all sent if stb is 0
                comb += ld_stbs_done.eq(~r1.wb.stb)

                # If we are still sending requests, was one accepted?
                with m.If((~bus.stall) & r1.wb.stb):
                    # That was the last word?  We are done sending.
                    # Clear stb and set ld_stbs_done so we can handle an
                    # eventual last ack on the same cycle.
                    # sigh - reconstruct wb adr with 3 extra 0s at front
                    wb_adr = Cat(Const(0, ROW_OFF_BITS), r1.wb.adr)
                    with m.If(is_last_row_addr(wb_adr, r1.end_row_ix)):
                        sync += r1.wb.stb.eq(0)
                        comb += ld_stbs_done.eq(1)

                    # Calculate the next row address in the current cache line
                    row = Signal(LINE_OFF_BITS-ROW_OFF_BITS)
                    comb += row.eq(r1.wb.adr)
                    sync += r1.wb.adr[:LINE_OFF_BITS-ROW_OFF_BITS].eq(row+1)

                # Incoming acks processing
                sync += r1.forward_valid1.eq(bus.ack)
                with m.If(bus.ack):
                    srow = Signal(ROW_LINE_BITS)
                    comb += srow.eq(r1.store_row)
                    sync += r1.rows_valid[srow].eq(1)

                    # If this is the data we were looking for,
                    # we can complete the request next cycle.
                    # Compare the whole address in case the
                    # request in r1.req is not the one that
                    # started this refill.
                    with m.If(req.valid & r1.req.same_tag &
                              ((r1.dcbz & r1.req.dcbz) |
                               (~r1.dcbz & (r1.req.op == Op.OP_LOAD_MISS))) &
                              (r1.store_row == get_row(req.real_addr))):
                        sync += r1.full.eq(0)
                        sync += r1.slow_valid.eq(1)
                        with m.If(~r1.mmu_req):
                            sync += r1.ls_valid.eq(1)
                        with m.Else():
                            sync += r1.mmu_done.eq(1)
                        sync += r1.forward_sel.eq(~0) # all 1s
                        sync += r1.use_forward1.eq(1)

                    # Check for completion
                    with m.If(ld_stbs_done & is_last_row(r1.store_row,
                                                         r1.end_row_ix)):
                        # Complete wishbone cycle
                        sync += r1.wb.cyc.eq(0)

                        # Cache line is now valid: set this way's bit
                        # in the per-way valid bitvector for the line
                        cv = Signal(NUM_WAYS)
                        comb += cv.eq(cache_tags[r1.store_index].valid)
                        comb += cv.bit_select(r1.store_way, 1).eq(1)
                        sync += cache_tags[r1.store_index].valid.eq(cv)

                        sync += r1.state.eq(State.IDLE)
                        sync += Display("cache valid set %x "
                                        "idx %d way %d",
                                        cv, r1.store_index, r1.store_way)

                    # Increment store row counter
                    sync += r1.store_row.eq(next_row(r1.store_row))

            with m.Case(State.STORE_WAIT_ACK):
                st_stbs_done = Signal()
                acks = Signal(3)
                adjust_acks = Signal(3)

                comb += st_stbs_done.eq(~r1.wb.stb)
                comb += acks.eq(r1.acks_pending)

                with m.If(r1.inc_acks != r1.dec_acks):
                    with m.If(r1.inc_acks):
                        comb += adjust_acks.eq(acks + 1)
                    with m.Else():
                        comb += adjust_acks.eq(acks - 1)
                with m.Else():
                    comb += adjust_acks.eq(acks)

                sync += r1.acks_pending.eq(adjust_acks)

                # Clear stb when slave accepted request
                with m.If(~bus.stall):
                    # See if there is another store waiting
                    # to be done which is in the same real page.
                    with m.If(req.valid):
                        _ra = req.real_addr[ROW_LINE_BITS:SET_SIZE_BITS]
                        sync += r1.wb.adr[0:SET_SIZE_BITS].eq(_ra)
                        sync += r1.wb.dat.eq(req.data)
                        sync += r1.wb.sel.eq(req.byte_sel)

                    with m.If((adjust_acks < 7) & req.same_tag &
                              ((req.op == Op.OP_STORE_MISS)
                               | (req.op == Op.OP_STORE_HIT))):
                        sync += r1.wb.stb.eq(1)
                        comb += st_stbs_done.eq(0)

                        with m.If(req.op == Op.OP_STORE_HIT):
                            sync += r1.write_bram.eq(1)
                        sync += r1.full.eq(0)
                        sync += r1.slow_valid.eq(1)

                        # Store requests never come from the MMU
                        sync += r1.ls_valid.eq(1)
                        comb += st_stbs_done.eq(0)
                        sync += r1.inc_acks.eq(1)
                    with m.Else():
                        sync += r1.wb.stb.eq(0)
                        comb += st_stbs_done.eq(1)

                # Got ack ? See if complete.
                with m.If(bus.ack):
                    with m.If(st_stbs_done & (adjust_acks == 1)):
                        sync += r1.state.eq(State.IDLE)
                        sync += r1.wb.cyc.eq(0)
                        sync += r1.wb.stb.eq(0)
                    sync += r1.dec_acks.eq(1)

            with m.Case(State.NC_LOAD_WAIT_ACK):
                # Clear stb when slave accepted request
                with m.If(~bus.stall):
                    sync += r1.wb.stb.eq(0)

                # Got ack ? complete.
                with m.If(bus.ack):
                    sync += r1.state.eq(State.IDLE)
                    sync += r1.full.eq(0)
                    sync += r1.slow_valid.eq(1)

                    with m.If(~r1.mmu_req):
                        sync += r1.ls_valid.eq(1)
                    with m.Else():
                        sync += r1.mmu_done.eq(1)

                    sync += r1.forward_sel.eq(~0) # all 1s
                    sync += r1.use_forward1.eq(1)
                    sync += r1.wb.cyc.eq(0)
                    sync += r1.wb.stb.eq(0)

    def dcache_log(self, m, r1, valid_ra, tlb_hit, stall_out):
        # note: disabled at the call site in elaborate() below; req_op and
        # r1.real_adr are stale references (no longer in scope / no longer
        # part of RegStage1) and would need re-wiring before re-enabling

        sync = m.d.sync
        d_out, bus, log_out = self.d_out, self.bus, self.log_out

        sync += log_out.eq(Cat(r1.state[:3], valid_ra, tlb_hit.way[:3],
                               stall_out, req_op[:3], d_out.valid, d_out.error,
                               r1.wb.cyc, r1.wb.stb, bus.ack, bus.stall,
                               r1.real_adr[3:6]))

    def elaborate(self, platform):

        m = Module()
        comb = m.d.comb
        d_in = self.d_in

        # Storage. Hopefully "cache_rows" is a BRAM, the rest is LUTs
        cache_tags = CacheTagArray()
        cache_tag_set = Signal(TAG_RAM_WIDTH)

        # TODO attribute ram_style : string;
        # TODO attribute ram_style of cache_tags : signal is "distributed";

        """note: these are passed to nmigen.hdl.Memory as "attributes".
           don't know how, just that they are.
        """
        # TODO attribute ram_style of
        #  dtlb_tags : signal is "distributed";
        # TODO attribute ram_style of
        #  dtlb_ptes : signal is "distributed";

        r0 = RegStage0("r0")
        r0_full = Signal()

        r1 = RegStage1("r1")

        reservation = Reservation()

        # Async signals on incoming request
        req_index = Signal(INDEX_BITS)
        req_row = Signal(ROW_BITS)
        req_hit_way = Signal(WAY_BITS)
        req_tag = Signal(TAG_BITS)
        req_op = Signal(Op)
        req_data = Signal(64)
        req_same_tag = Signal()
        req_go = Signal()

        early_req_row = Signal(ROW_BITS)

        cancel_store = Signal()
        set_rsrv = Signal()
        clear_rsrv = Signal()

        r0_valid = Signal()
        r0_stall = Signal()

        use_forward1_next = Signal()
        use_forward2_next = Signal()

        cache_out_row = Signal(WB_DATA_BITS)

        plru_victim = Signal(WAY_BITS)
        replace_way = Signal(WAY_BITS)

        # Wishbone read/write/cache write formatting signals
        bus_sel = Signal(8)

        # TLB signals
        tlb_way = TLBRecord("tlb_way")
        tlb_req_index = Signal(TLB_SET_BITS)
        tlb_hit = TLBHit("tlb_hit")
        pte = Signal(TLB_PTE_BITS)
        ra = Signal(REAL_ADDR_BITS)
        valid_ra = Signal()
        perm_attr = PermAttr("dc_perms")
        rc_ok = Signal()
        perm_ok = Signal()
        access_ok = Signal()

        tlb_plru_victim = Signal(TLB_WAY_BITS)

        # we don't yet handle collisions between loadstore1 requests
        # and MMU requests
        comb += self.m_out.stall.eq(0)

        # Hold off the request in r0 when r1 has an uncompleted request
        comb += r0_stall.eq(r0_full & (r1.full | d_in.hold))
        comb += r0_valid.eq(r0_full & ~r1.full & ~d_in.hold)
        comb += self.stall_out.eq(r0_stall)

        # deal with litex not doing wishbone pipeline mode
        # XXX in wrong way.  FIFOs are needed in the SRAM test
        # so that stb/ack match up. same thing done in icache.py
        comb += self.bus.stall.eq(self.bus.cyc & ~self.bus.ack)

        # Wire up wishbone request latch out of stage 1
        comb += self.bus.we.eq(r1.wb.we)
        comb += self.bus.adr.eq(r1.wb.adr)
        comb += self.bus.sel.eq(r1.wb.sel)
        comb += self.bus.stb.eq(r1.wb.stb)
        comb += self.bus.dat_w.eq(r1.wb.dat)
        comb += self.bus.cyc.eq(r1.wb.cyc)

        # create submodule TLBUpdate
        m.submodules.dtlb_update = self.dtlb_update = DTLBUpdate()

        # call sub-functions putting everything together, using shared
        # signals established above
        self.stage_0(m, r0, r1, r0_full)
        self.tlb_read(m, r0_stall, tlb_way)
        self.tlb_search(m, tlb_req_index, r0, r0_valid,
                        tlb_way,
                        pte, tlb_hit, valid_ra, perm_attr, ra)
        self.tlb_update(m, r0_valid, r0, tlb_req_index,
                        tlb_hit, tlb_plru_victim)
        self.maybe_plrus(m, r1, plru_victim)
        self.maybe_tlb_plrus(m, r1, tlb_plru_victim, tlb_req_index)
        self.cache_tag_read(m, r0_stall, req_index, cache_tag_set, cache_tags)
        self.dcache_request(m, r0, ra, req_index, req_row, req_tag,
                            r0_valid, r1, cache_tags, replace_way,
                            use_forward1_next, use_forward2_next,
                            req_hit_way, plru_victim, rc_ok, perm_attr,
                            valid_ra, perm_ok, access_ok, req_op, req_go,
                            tlb_hit, tlb_way, cache_tag_set,
                            cancel_store, req_same_tag, r0_stall,
                            early_req_row)
        self.reservation_comb(m, cancel_store, set_rsrv, clear_rsrv,
                              r0_valid, r0, reservation)
        self.reservation_reg(m, r0_valid, access_ok, set_rsrv, clear_rsrv,
                             reservation, r0)
        self.writeback_control(m, r1, cache_out_row)
        self.rams(m, r1, early_req_row, cache_out_row, replace_way)
        self.dcache_fast_hit(m, req_op, r0_valid, r0, r1,
                             req_hit_way, req_index, req_tag, access_ok,
                             tlb_hit, tlb_req_index)
        self.dcache_slow(m, r1, use_forward1_next, use_forward2_next,
                         r0, replace_way,
                         req_hit_way, req_same_tag,
                         r0_valid, req_op, cache_tags, req_go, ra)
        #self.dcache_log(m, r1, valid_ra, tlb_hit, stall_out)

        return m


if __name__ == '__main__':
    dut = DCache()
    vl = rtlil.convert(dut, ports=[])
    with open("test_dcache.il", "w") as f:
        f.write(vl)
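
# usage note (a sketch, assuming the soc/nmutil/nmigen dependencies are
# installed): running "python dcache.py" prints the geometry summary at
# import time and writes test_dcache.il; that file can then be loaded
# into yosys for inspection, e.g. with "read_ilang test_dcache.il"
# (read_rtlil being the newer name for the same yosys command).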