#!/usr/bin/env python3
#
# Copyright (C) 2020,2021 Luke Kenneth Casson Leighton <lkcl@lkcl.net>
# Copyright (C) 2020 Cole Poirier
# Copyright (C) 2020,2021 Cesar Strauss
# Copyright (C) 2021 Tobias Platen
#
# Original dcache.vhdl Copyright of its authors and licensed
# by IBM under CC-BY 4.0
# https://github.com/antonblanchard/microwatt
#
# Conversion to nmigen funded by NLnet and NGI POINTER under EU Grants
# 871528 and 957073, under the LGPL-v3+ License

"""DCache

based on Anton Blanchard microwatt dcache.vhdl

note that the microwatt dcache wishbone interface expects "stall".
for simplicity at the moment this is hard-coded to cyc & ~ack.
see WB4 spec, p84, section 5.2.1

IMPORTANT: for store, the data is sampled the cycle AFTER the "valid"
is raised.  sigh

Links:

* https://libre-soc.org/3d_gpu/architecture/set_associative_cache.jpg
* https://bugs.libre-soc.org/show_bug.cgi?id=469
* https://libre-soc.org/irclog-microwatt/%23microwatt.2021-12-07.log.html
  (discussion about brams for ECP5)

"""
34
35 import sys
36
37 from nmutil.gtkw import write_gtkw
38
39 sys.setrecursionlimit(1000000)
40
41 from enum import Enum, unique
42
43 from nmigen import (Module, Signal, Elaboratable, Cat, Repl, Array, Const,
44 Record, Memory)
45 from nmutil.util import Display
46 from nmigen.lib.coding import Decoder
47
48 from copy import deepcopy
49 from random import randint, seed
50
51 from nmigen_soc.wishbone.bus import Interface
52
53 from nmigen.cli import main
54 from nmutil.iocontrol import RecordObject
55 from nmigen.utils import log2_int
56 from soc.experiment.mem_types import (LoadStore1ToDCacheType,
57 DCacheToLoadStore1Type,
58 MMUToDCacheType,
59 DCacheToMMUType)
60
61 from soc.experiment.wb_types import (WB_ADDR_BITS, WB_DATA_BITS, WB_SEL_BITS,
62 WBAddrType, WBDataType, WBSelType,
63 WBMasterOut, WBSlaveOut,
64 WBMasterOutVector, WBSlaveOutVector,
65 WBIOMasterOut, WBIOSlaveOut)
66
67 from soc.experiment.cache_ram import CacheRam
68 from soc.experiment.plru import PLRU, PLRUs
69 #from nmutil.plru import PLRU, PLRUs
70
71 # for test
72 from soc.bus.sram import SRAM
73 from nmigen import Memory
74 from nmigen.cli import rtlil
75
76 # NOTE: to use cxxsim, export NMIGEN_SIM_MODE=cxxsim from the shell
77 # Also, check out the cxxsim nmigen branch, and latest yosys from git
78 from nmutil.sim_tmp_alternative import Simulator
79
80 from nmutil.util import wrap
81
82
# TODO: make these parameters of DCache at some point
LINE_SIZE = 64    # Line size in bytes
NUM_LINES = 64    # Number of lines in a set
NUM_WAYS = 2      # Number of ways
TLB_SET_SIZE = 64 # L1 DTLB entries per set
TLB_NUM_WAYS = 2  # L1 DTLB number of ways
TLB_LG_PGSZ = 12  # L1 DTLB log_2(page_size)
LOG_LENGTH = 0    # Non-zero to enable log data collection

# BRAM organisation: We never access more than WB_DATA_BITS
# at a time so, to save resources, we make the array only that
# wide, and use consecutive indices to make a cache "line"
#
# ROW_SIZE is the width in bytes of the BRAM (based on WB, so 64-bits)
ROW_SIZE = WB_DATA_BITS // 8

# ROW_PER_LINE is the number of rows (wishbone
# transactions) in a line
ROW_PER_LINE = LINE_SIZE // ROW_SIZE

# BRAM_ROWS is the number of rows in BRAM needed
# to represent the full dcache
BRAM_ROWS = NUM_LINES * ROW_PER_LINE
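
# a worked example with the default geometry above (matches the prints
# below): WB_DATA_BITS=64, so ROW_SIZE = 64//8 = 8 bytes per BRAM row,
# ROW_PER_LINE = 64//8 = 8 wishbone beats per cache line, and
# BRAM_ROWS = 64 * 8 = 512 BRAM rows for the entire dcache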

print ("ROW_SIZE", ROW_SIZE)
print ("ROW_PER_LINE", ROW_PER_LINE)
print ("BRAM_ROWS", BRAM_ROWS)
print ("NUM_WAYS", NUM_WAYS)

# Bit field counts in the address

# REAL_ADDR_BITS is the number of real address
# bits that we store
REAL_ADDR_BITS = 56

# ROW_BITS is the number of bits to select a row
ROW_BITS = log2_int(BRAM_ROWS)

# ROW_LINE_BITS is the number of bits to select
# a row within a line
ROW_LINE_BITS = log2_int(ROW_PER_LINE)

# LINE_OFF_BITS is the number of bits for
# the offset in a cache line
LINE_OFF_BITS = log2_int(LINE_SIZE)

# ROW_OFF_BITS is the number of bits for
# the offset in a row
ROW_OFF_BITS = log2_int(ROW_SIZE)

# INDEX_BITS is the number of bits to
# select a cache line
INDEX_BITS = log2_int(NUM_LINES)

# SET_SIZE_BITS is the log base 2 of the set size
SET_SIZE_BITS = LINE_OFF_BITS + INDEX_BITS

# TAG_BITS is the number of bits of
# the tag part of the address
TAG_BITS = REAL_ADDR_BITS - SET_SIZE_BITS

# TAG_WIDTH is the width in bits of each way of the tag RAM
TAG_WIDTH = TAG_BITS + 7 - ((TAG_BITS + 7) % 8)
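# e.g. with the defaults, TAG_BITS = 56 - 12 = 44, and each way's tag is
# rounded up to the next multiple of 8 bits:
#   TAG_WIDTH = 44 + 7 - ((44 + 7) % 8) = 51 - 3 = 48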

# WAY_BITS is the number of bits to select a way
WAY_BITS = log2_int(NUM_WAYS)

# Example of layout (the numbers are computed for the geometry above):
layout = f"""\
  DCache Layout:
 |.. -----------------------| REAL_ADDR_BITS ({REAL_ADDR_BITS})
  ..         |--------------| SET_SIZE_BITS ({SET_SIZE_BITS})
  ..  tag          |index| line  |
  ..            |   row   |    |
  ..            |     |---|    | ROW_LINE_BITS ({ROW_LINE_BITS})
  ..            |     |--- - --| LINE_OFF_BITS ({LINE_OFF_BITS})
  ..            |         |- --| ROW_OFF_BITS  ({ROW_OFF_BITS})
  ..            |----- ---|    | ROW_BITS      ({ROW_BITS})
  ..            |-----|        | INDEX_BITS    ({INDEX_BITS})
  ..  --------|                | TAG_BITS      ({TAG_BITS})
"""
print (layout)
print ("Dcache TAG %d IDX %d ROW_BITS %d ROFF %d LOFF %d RLB %d" % \
            (TAG_BITS, INDEX_BITS, ROW_BITS,
             ROW_OFF_BITS, LINE_OFF_BITS, ROW_LINE_BITS))
print ("index @: %d-%d" % (LINE_OFF_BITS, SET_SIZE_BITS))
print ("row @: %d-%d" % (ROW_OFF_BITS, SET_SIZE_BITS))
print ("tag @: %d-%d width %d" % (SET_SIZE_BITS, REAL_ADDR_BITS, TAG_WIDTH))

TAG_RAM_WIDTH = TAG_WIDTH * NUM_WAYS

print ("TAG_RAM_WIDTH", TAG_RAM_WIDTH)
print ("    TAG_WIDTH", TAG_WIDTH)
print ("     NUM_WAYS", NUM_WAYS)
print ("    NUM_LINES", NUM_LINES)


def CacheTagArray():
    return Array(Signal(TAG_RAM_WIDTH, name="tag%d" % x) \
                        for x in range(NUM_LINES))

def CacheValidsArray():
    return Array(Signal(NUM_WAYS, name="tag_valids%d" % x)
                        for x in range(NUM_LINES))

def RowPerLineValidArray():
    return Array(Signal(name="rows_valid%d" % x) \
                        for x in range(ROW_PER_LINE))


# L1 TLB
TLB_SET_BITS = log2_int(TLB_SET_SIZE)
TLB_WAY_BITS = log2_int(TLB_NUM_WAYS)
TLB_EA_TAG_BITS = 64 - (TLB_LG_PGSZ + TLB_SET_BITS)
TLB_TAG_WAY_BITS = TLB_NUM_WAYS * TLB_EA_TAG_BITS
TLB_PTE_BITS = 64
TLB_PTE_WAY_BITS = TLB_NUM_WAYS * TLB_PTE_BITS
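# with the defaults: TLB_SET_BITS = 6, TLB_WAY_BITS = 1,
# TLB_EA_TAG_BITS = 64 - (12 + 6) = 46, TLB_TAG_WAY_BITS = 2 * 46 = 92,
# TLB_PTE_WAY_BITS = 2 * 64 = 128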

def ispow2(x):
    return (1<<log2_int(x, False)) == x

assert (LINE_SIZE % ROW_SIZE) == 0, "LINE_SIZE not multiple of ROW_SIZE"
assert ispow2(LINE_SIZE), "LINE_SIZE not power of 2"
assert ispow2(NUM_LINES), "NUM_LINES not power of 2"
assert ispow2(ROW_PER_LINE), "ROW_PER_LINE not power of 2"
assert ROW_BITS == (INDEX_BITS + ROW_LINE_BITS), "geometry bits don't add up"
assert (LINE_OFF_BITS == ROW_OFF_BITS + ROW_LINE_BITS), \
        "geometry bits don't add up"
assert REAL_ADDR_BITS == (TAG_BITS + INDEX_BITS + LINE_OFF_BITS), \
        "geometry bits don't add up"
assert REAL_ADDR_BITS == (TAG_BITS + ROW_BITS + ROW_OFF_BITS), \
        "geometry bits don't add up"
assert 64 == WB_DATA_BITS, "Can't yet handle wb width that isn't 64-bits"
assert SET_SIZE_BITS <= TLB_LG_PGSZ, "Set indexed by virtual address"
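# the last assert is what makes the cache safely virtually-indexed: all of
# the index and line-offset bits (SET_SIZE_BITS) fall within the page offset
# (TLB_LG_PGSZ), so they are identical in the effective and real address,
# and the cache set can be looked up in parallel with the TLB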


def TLBHit(name):
    return Record([('valid', 1),
                   ('way', TLB_WAY_BITS)], name=name)

def TLBTagEAArray():
    return Array(Signal(TLB_EA_TAG_BITS, name="tlbtagea%d" % x) \
                for x in range (TLB_NUM_WAYS))

def TLBRecord(name):
    tlb_layout = [('valid', TLB_NUM_WAYS),
                  ('tag', TLB_TAG_WAY_BITS),
                  ('pte', TLB_PTE_WAY_BITS)
                 ]
    return Record(tlb_layout, name=name)

def TLBValidArray():
    return Array(Signal(TLB_NUM_WAYS, name="tlb_valid%d" % x)
                        for x in range(TLB_SET_SIZE))

def HitWaySet():
    return Array(Signal(WAY_BITS, name="hitway_%d" % x) \
                        for x in range(TLB_NUM_WAYS))

# Cache RAM interface
def CacheRamOut():
    return Array(Signal(WB_DATA_BITS, name="cache_out%d" % x) \
                 for x in range(NUM_WAYS))

# PLRU output interface
def PLRUOut():
    return Array(Signal(WAY_BITS, name="plru_out%d" % x) \
                for x in range(NUM_LINES))

# TLB PLRU output interface
def TLBPLRUOut():
    return Array(Signal(TLB_WAY_BITS, name="tlbplru_out%d" % x) \
                for x in range(TLB_SET_SIZE))

# Helper functions to decode incoming requests
#
# Return the cache line index (tag index) for an address
def get_index(addr):
    return addr[LINE_OFF_BITS:SET_SIZE_BITS]

# Return the cache row index (data memory) for an address
def get_row(addr):
    return addr[ROW_OFF_BITS:SET_SIZE_BITS]

# Return the index of a row within a line
def get_row_of_line(row):
    return row[:ROW_LINE_BITS]

# Returns whether the address is in the last row of a line
def is_last_row_addr(addr, last):
    return addr[ROW_OFF_BITS:LINE_OFF_BITS] == last

# Returns whether this is the last row of a line
def is_last_row(row, last):
    return get_row_of_line(row) == last

# Return the next row in the current cache line.  We use a
# dedicated function in order to limit the size of the
# generated adder to be only the bits within a cache line
# (3 bits with default settings)
def next_row(row):
    row_v = row[0:ROW_LINE_BITS] + 1
    return Cat(row_v[:ROW_LINE_BITS], row[ROW_LINE_BITS:])

# Get the tag value from the address
def get_tag(addr):
    return addr[SET_SIZE_BITS:REAL_ADDR_BITS]
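
# a worked example of the decode helpers above (a sketch, default geometry):
# for real address 0x12345678,
#   get_index -> addr[6:12]  = 0x19  (line 25)
#   get_row   -> addr[3:12]  = 0xcf  (BRAM row 207)
#   get_tag   -> addr[12:56] = 0x12345
# next_row only adds within ROW_LINE_BITS (3 bits here), so row
# 0b011001111 (207) wraps to 0b011001000 (200) at the end of the line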

# Read a tag from a tag memory row
def read_tag(way, tagset):
    return tagset.word_select(way, TAG_WIDTH)[:TAG_BITS]

# Read a TLB tag from a TLB tag memory row
def read_tlb_tag(way, tags):
    return tags.word_select(way, TLB_EA_TAG_BITS)

# Write a TLB tag to a TLB tag memory row
def write_tlb_tag(way, tags, tag):
    return read_tlb_tag(way, tags).eq(tag)

# Read a PTE from a TLB PTE memory row
def read_tlb_pte(way, ptes):
    return ptes.word_select(way, TLB_PTE_BITS)

# Write a PTE to a TLB PTE memory row
def write_tlb_pte(way, ptes, newpte):
    return read_tlb_pte(way, ptes).eq(newpte)
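
# e.g. with TLB_EA_TAG_BITS=46, read_tlb_tag(1, tags) selects bits [46:92]
# of the packed tag row, and write_tlb_tag(1, tags, tag) assigns just that
# slice.  read_tag works the same way on the cache tag RAM, in TAG_WIDTH
# (byte-rounded) chunks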


# Record for storing permission, attribute, etc. bits from a PTE
class PermAttr(RecordObject):
    def __init__(self, name=None):
        super().__init__(name=name)
        self.reference = Signal()
        self.changed   = Signal()
        self.nocache   = Signal()
        self.priv      = Signal()
        self.rd_perm   = Signal()
        self.wr_perm   = Signal()


def extract_perm_attr(pte):
    # note: currently a stub, returns an all-zero PermAttr
    pa = PermAttr()
    return pa


# Type of operation on a "valid" input
@unique
class Op(Enum):
    OP_NONE       = 0
    OP_BAD        = 1 # NC cache hit, TLB miss, prot/RC failure
    OP_STCX_FAIL  = 2 # conditional store w/o reservation
    OP_LOAD_HIT   = 3 # Cache hit on load
    OP_LOAD_MISS  = 4 # Load missing cache
    OP_LOAD_NC    = 5 # Non-cachable load
    OP_STORE_HIT  = 6 # Store hitting cache
    OP_STORE_MISS = 7 # Store missing cache


# Cache state machine
@unique
class State(Enum):
    IDLE             = 0 # Normal load hit processing
    RELOAD_WAIT_ACK  = 1 # Cache reload wait ack
    STORE_WAIT_ACK   = 2 # Store wait ack
    NC_LOAD_WAIT_ACK = 3 # Non-cachable load wait ack


# Dcache operations:
#
# In order to make timing, we use the BRAMs with
# an output buffer, which means that the BRAM
# output is delayed by an extra cycle.
#
# Thus, the dcache has a 2-stage internal pipeline
# for cache hits with no stalls.
#
# All other operations are handled via stalling
# in the first stage.
#
# The second stage can thus complete a hit at the same
# time as the first stage emits a stall for a complex op.
#
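# A rough cycle-by-cycle sketch of a load hit through that 2-stage
# pipeline (see also rams() and writeback_control() below):
#
#   cycle 0: request presented; latched into r0, and the BRAM read is
#            started early (early_req_row) to hide the BRAM latency
#   cycle 1: TLB and cache-tag compare, hit detection (stage 1)
#   cycle 2: buffered BRAM output is selected by the 1-cycle-delayed
#            hit way, and d_out.valid is raised with the load result
#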
# Stage 0 register, basically contains just the latched request

class RegStage0(RecordObject):
    def __init__(self, name=None):
        super().__init__(name=name)
        self.req     = LoadStore1ToDCacheType(name="lsmem")
        self.tlbie   = Signal() # indicates a tlbie request (from MMU)
        self.doall   = Signal() # with tlbie, indicates flush whole TLB
        self.tlbld   = Signal() # indicates a TLB load request (from MMU)
        self.mmu_req = Signal() # indicates source of request
        self.d_valid = Signal() # indicates req.data is valid now


class MemAccessRequest(RecordObject):
    def __init__(self, name=None):
        super().__init__(name=name)
        self.op        = Signal(Op)
        self.valid     = Signal()
        self.dcbz      = Signal()
        self.real_addr = Signal(REAL_ADDR_BITS)
        self.data      = Signal(64)
        self.byte_sel  = Signal(8)
        self.hit_way   = Signal(WAY_BITS)
        self.same_tag  = Signal()
        self.mmu_req   = Signal()


# First stage register, contains state for stage 1 of load hits
# and for the state machine used by all other operations
class RegStage1(RecordObject):
    def __init__(self, name=None):
        super().__init__(name=name)
        # Info about the request
        self.full           = Signal() # have uncompleted request
        self.mmu_req        = Signal() # request is from MMU
        self.req            = MemAccessRequest(name="reqmem")

        # Cache hit state
        self.hit_way        = Signal(WAY_BITS)
        self.hit_load_valid = Signal()
        self.hit_index      = Signal(INDEX_BITS)
        self.cache_hit      = Signal()

        # TLB hit state
        self.tlb_hit        = TLBHit("tlb_hit")
        self.tlb_hit_index  = Signal(TLB_SET_BITS)

        # 2-stage data buffer for data forwarded from writes to reads
        self.forward_data1  = Signal(64)
        self.forward_data2  = Signal(64)
        self.forward_sel1   = Signal(8)
        self.forward_valid1 = Signal()
        self.forward_way1   = Signal(WAY_BITS)
        self.forward_row1   = Signal(ROW_BITS)
        self.use_forward1   = Signal()
        self.forward_sel    = Signal(8)

        # Cache miss state (reload state machine)
        self.state          = Signal(State)
        self.dcbz           = Signal()
        self.write_bram     = Signal()
        self.write_tag      = Signal()
        self.slow_valid     = Signal()
        self.wb             = WBMasterOut("wb")
        self.reload_tag     = Signal(TAG_BITS)
        self.store_way      = Signal(WAY_BITS)
        self.store_row      = Signal(ROW_BITS)
        self.store_index    = Signal(INDEX_BITS)
        self.end_row_ix     = Signal(ROW_LINE_BITS)
        self.rows_valid     = RowPerLineValidArray()
        self.acks_pending   = Signal(3)
        self.inc_acks       = Signal()
        self.dec_acks       = Signal()

        # Signals to complete (possibly with error)
        self.ls_valid       = Signal()
        self.ls_error       = Signal()
        self.mmu_done       = Signal()
        self.mmu_error      = Signal()
        self.cache_paradox  = Signal()

        # Signal to complete a failed stcx.
        self.stcx_fail      = Signal()


# Reservation information
class Reservation(RecordObject):
    def __init__(self, name=None):
        super().__init__(name=name)
        self.valid = Signal()
        self.addr  = Signal(64-LINE_OFF_BITS)


class DTLBUpdate(Elaboratable):
    def __init__(self):
        self.tlbie    = Signal()
        self.tlbwe    = Signal()
        self.doall    = Signal()
        self.tlb_hit  = TLBHit("tlb_hit")
        self.tlb_req_index = Signal(TLB_SET_BITS)

        self.repl_way = Signal(TLB_WAY_BITS)
        self.eatag    = Signal(TLB_EA_TAG_BITS)
        self.pte_data = Signal(TLB_PTE_BITS)

        # read from dtlb array
        self.tlb_read       = Signal()
        self.tlb_read_index = Signal(TLB_SET_BITS)
        self.tlb_way        = TLBRecord("o_tlb_way")

    def elaborate(self, platform):
        m = Module()
        comb = m.d.comb
        sync = m.d.sync

        # there are 3 parts to this:
        # QTY TLB_NUM_WAYs TAGs - of width (say) 46 bits of Effective Address
        # QTY TLB_NUM_WAYs PTEs - of width (say) 64 bits
        # "Valid" bits, one per "way", of QTY TLB_NUM_WAYs.  these cannot
        # be a Memory because they can all be cleared (tlbie, doall), i mean,
        # we _could_, in theory, by overriding the Reset Signal of the Memory,
        # hmmm....

        dtlb_valid = TLBValidArray()
        tlb_req_index = self.tlb_req_index

        print ("TLB_TAG_WAY_BITS", TLB_TAG_WAY_BITS)
        print ("     TLB_EA_TAG_BITS", TLB_EA_TAG_BITS)
        print ("        TLB_NUM_WAYS", TLB_NUM_WAYS)
        print ("TLB_PTE_WAY_BITS", TLB_PTE_WAY_BITS)
        print ("    TLB_PTE_BITS", TLB_PTE_BITS)
        print ("    TLB_NUM_WAYS", TLB_NUM_WAYS)

        # TAG and PTE Memory SRAMs. transparent, write-enables are TLB_NUM_WAYS
        tagway = Memory(depth=TLB_SET_SIZE, width=TLB_TAG_WAY_BITS)
        m.submodules.rd_tagway = rd_tagway = tagway.read_port()
        m.submodules.wr_tagway = wr_tagway = tagway.write_port(
                                    granularity=TLB_EA_TAG_BITS)

        pteway = Memory(depth=TLB_SET_SIZE, width=TLB_PTE_WAY_BITS)
        m.submodules.rd_pteway = rd_pteway = pteway.read_port()
        m.submodules.wr_pteway = wr_pteway = pteway.write_port(
                                    granularity=TLB_PTE_BITS)

        # commented out for now, can be put in if Memory.reset can be
        # used for tlbie & doall to reset the entire Memory to zero in 1 cycle
        #validm = Memory(depth=TLB_SET_SIZE, width=TLB_NUM_WAYS)
        #m.submodules.rd_valid = rd_valid = validm.read_port()
        #m.submodules.wr_valid = wr_valid = validm.write_port(
        #    granularity=1)

        # connect up read and write addresses to Valid/PTE/TAG SRAMs
        m.d.comb += rd_pteway.addr.eq(self.tlb_read_index)
        m.d.comb += rd_tagway.addr.eq(self.tlb_read_index)
        #m.d.comb += rd_valid.addr.eq(self.tlb_read_index)
        m.d.comb += wr_tagway.addr.eq(tlb_req_index)
        m.d.comb += wr_pteway.addr.eq(tlb_req_index)
        #m.d.comb += wr_valid.addr.eq(tlb_req_index)

        updated   = Signal()
        v_updated = Signal()
        tb_out = Signal(TLB_TAG_WAY_BITS) # tlb_way_tags_t
        db_out = Signal(TLB_NUM_WAYS)     # tlb_way_valids_t
        pb_out = Signal(TLB_PTE_WAY_BITS) # tlb_way_ptes_t
        dv = Signal(TLB_NUM_WAYS)         # tlb_way_valids_t

        comb += dv.eq(dtlb_valid[tlb_req_index])
        comb += db_out.eq(dv)

        with m.If(self.tlbie & self.doall):
            # clear all valid bits at once
            # XXX hmmm, validm _could_ use Memory reset here...
            for i in range(TLB_SET_SIZE):
                sync += dtlb_valid[i].eq(0)
        with m.Elif(self.tlbie):
            # invalidate just the hit_way
            with m.If(self.tlb_hit.valid):
                comb += db_out.bit_select(self.tlb_hit.way, 1).eq(0)
                comb += v_updated.eq(1)
        with m.Elif(self.tlbwe):
            # write to the requested tag and PTE
            comb += write_tlb_tag(self.repl_way, tb_out, self.eatag)
            comb += write_tlb_pte(self.repl_way, pb_out, self.pte_data)
            # set valid bit
            comb += db_out.bit_select(self.repl_way, 1).eq(1)

            comb += updated.eq(1)
            comb += v_updated.eq(1)

        # above, sometimes valid is requested to be updated but data not
        # therefore split them out, here.  note the granularity thing matches
        # with the shift-up of the eatag/pte_data into the correct TLB way.
        # thus it is not necessary to write the entire lot, just the portion
        # being altered: hence writing the *old* copy of the row is not needed
        with m.If(updated): # PTE and TAG to be written
            comb += wr_pteway.data.eq(pb_out)
            comb += wr_pteway.en.eq(1<<self.repl_way)
            comb += wr_tagway.data.eq(tb_out)
            comb += wr_tagway.en.eq(1<<self.repl_way)
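        # e.g. with repl_way=1 the one-hot write-enable is 0b10: thanks to
        # the write-port granularity above, only the second
        # TLB_EA_TAG_BITS-wide (and TLB_PTE_BITS-wide) slice of the row is
        # written, so the other way's old contents need not be read back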
        with m.If(v_updated): # Valid to be written
            sync += dtlb_valid[tlb_req_index].eq(db_out)
            #comb += wr_valid.data.eq(db_out)
            #comb += wr_valid.en.eq(1<<self.repl_way)

        # select one TLB way, use a register here
        r_delay = Signal()
        sync += r_delay.eq(self.tlb_read)
        # first deal with the valids, which are not in a Memory.
        # tlb way valid is output on a 1 clock delay with sync,
        # but have to explicitly deal with "forwarding" here
        with m.If(self.tlb_read):
            with m.If(v_updated): # write *and* read in same cycle: forward
                sync += self.tlb_way.valid.eq(db_out)
            with m.Else():
                sync += self.tlb_way.valid.eq(dtlb_valid[self.tlb_read_index])
        # now deal with the Memory-read case. the output must remain
        # valid (stable) even when a read-request is not made, but stable
        # on a one-clock delay, hence the register
        r_tlb_way = TLBRecord("r_tlb_way")
        with m.If(r_delay):
            # on one clock delay, capture the contents of the read port(s)
            comb += self.tlb_way.tag.eq(rd_tagway.data)
            comb += self.tlb_way.pte.eq(rd_pteway.data)
            sync += r_tlb_way.tag.eq(rd_tagway.data)
            sync += r_tlb_way.pte.eq(rd_pteway.data)
        with m.Else():
            # ... so that the register can output it when no read is requested
            # it's rather overkill but better to be safe than sorry
            comb += self.tlb_way.tag.eq(r_tlb_way.tag)
            comb += self.tlb_way.pte.eq(r_tlb_way.pte)
            #comb += self.tlb_way.eq(r_tlb_way)

        return m


class DCachePendingHit(Elaboratable):

    def __init__(self, tlb_way,
                      cache_i_validdx, cache_tag_set,
                      req_addr):

        self.go          = Signal()
        self.virt_mode   = Signal()
        self.is_hit      = Signal()
        self.tlb_hit     = TLBHit("tlb_hit")
        self.hit_way     = Signal(WAY_BITS)
        self.rel_match   = Signal()
        self.req_index   = Signal(INDEX_BITS)
        self.reload_tag  = Signal(TAG_BITS)

        self.tlb_way = tlb_way
        self.cache_i_validdx = cache_i_validdx
        self.cache_tag_set = cache_tag_set
        self.req_addr = req_addr

    def elaborate(self, platform):
        m = Module()
        comb = m.d.comb
        sync = m.d.sync

        go = self.go
        virt_mode = self.virt_mode
        is_hit = self.is_hit
        tlb_way = self.tlb_way
        cache_i_validdx = self.cache_i_validdx
        cache_tag_set = self.cache_tag_set
        req_addr = self.req_addr
        tlb_hit = self.tlb_hit
        hit_way = self.hit_way
        rel_match = self.rel_match
        req_index = self.req_index
        reload_tag = self.reload_tag

        hit_set     = Array(Signal(name="hit_set_%d" % i) \
                                  for i in range(TLB_NUM_WAYS))
        rel_matches = Array(Signal(name="rel_matches_%d" % i) \
                                    for i in range(TLB_NUM_WAYS))
        hit_way_set = HitWaySet()

        # Test if pending request is a hit on any way
        # In order to make timing in virtual mode,
        # when we are using the TLB, we compare each
        # way with each of the real addresses from each way of
        # the TLB, and then decide later which match to use.

        with m.If(virt_mode):
            for j in range(TLB_NUM_WAYS): # tlb_num_way_t
                s_tag       = Signal(TAG_BITS, name="s_tag%d" % j)
                s_hit       = Signal(name="s_hit%d" % j)
                s_pte       = Signal(TLB_PTE_BITS, name="s_pte%d" % j)
                s_ra        = Signal(REAL_ADDR_BITS, name="s_ra%d" % j)
                # read the PTE, calc the Real Address, get the tag
                comb += s_pte.eq(read_tlb_pte(j, tlb_way.pte))
                comb += s_ra.eq(Cat(req_addr[0:TLB_LG_PGSZ],
                                    s_pte[TLB_LG_PGSZ:REAL_ADDR_BITS]))
                comb += s_tag.eq(get_tag(s_ra))
                # for each way check the tag against the cache tag set
                for i in range(NUM_WAYS): # way_t
                    is_tag_hit = Signal(name="is_tag_hit_%d_%d" % (j, i))
                    comb += is_tag_hit.eq(go & cache_i_validdx[i] &
                                  (read_tag(i, cache_tag_set) == s_tag)
                                  & (tlb_way.valid[j]))
                    with m.If(is_tag_hit):
                        comb += hit_way_set[j].eq(i)
                        comb += s_hit.eq(1)
                comb += hit_set[j].eq(s_hit)
                comb += rel_matches[j].eq(s_tag == reload_tag)
            with m.If(tlb_hit.valid):
                comb += is_hit.eq(hit_set[tlb_hit.way])
                comb += hit_way.eq(hit_way_set[tlb_hit.way])
                comb += rel_match.eq(rel_matches[tlb_hit.way])
        with m.Else():
            s_tag       = Signal(TAG_BITS)
            comb += s_tag.eq(get_tag(req_addr))
            for i in range(NUM_WAYS): # way_t
                is_tag_hit = Signal(name="is_tag_hit_%d" % i)
                comb += is_tag_hit.eq(go & cache_i_validdx[i] &
                          (read_tag(i, cache_tag_set) == s_tag))
                with m.If(is_tag_hit):
                    comb += hit_way.eq(i)
                    comb += is_hit.eq(1)
            with m.If(s_tag == reload_tag):
                comb += rel_match.eq(1)

        return m


class DCache(Elaboratable):
    """Set associative dcache write-through

    TODO (in no specific order):
    * See list in icache.vhdl
    * Complete load misses on the cycle when WB data comes instead of
      at the end of line (this requires dealing with requests coming in
      while not idle...)
    """
    def __init__(self, pspec=None):
        self.d_in      = LoadStore1ToDCacheType("d_in")
        self.d_out     = DCacheToLoadStore1Type("d_out")

        self.m_in      = MMUToDCacheType("m_in")
        self.m_out     = DCacheToMMUType("m_out")

        self.stall_out = Signal()
        self.any_stall_out = Signal()
        self.dreq_when_stall = Signal()
        self.mreq_when_stall = Signal()

        # standard naming (wired to non-standard for compatibility)
        self.bus = Interface(addr_width=32,
                            data_width=64,
                            granularity=8,
                            features={'stall'},
                            alignment=0,
                            name="dcache")

        self.log_out   = Signal(20)

        # test if microwatt compatibility is to be enabled
        self.microwatt_compat = (hasattr(pspec, "microwatt_compat") and
                                 (pspec.microwatt_compat == True))

    def stage_0(self, m, r0, r1, r0_full):
        """Latch the request in r0.req as long as we're not stalling
        """
        comb = m.d.comb
        sync = m.d.sync
        d_in, d_out, m_in = self.d_in, self.d_out, self.m_in

        r = RegStage0("stage0")

        # TODO, this goes in unit tests and formal proofs
        with m.If(d_in.valid & m_in.valid):
            sync += Display("request collision loadstore vs MMU")

        with m.If(m_in.valid):
            comb += r.req.valid.eq(1)
            comb += r.req.load.eq(~(m_in.tlbie | m_in.tlbld)) # no invalidate
            comb += r.req.dcbz.eq(0)
            comb += r.req.nc.eq(0)
            comb += r.req.reserve.eq(0)
            comb += r.req.virt_mode.eq(0)
            comb += r.req.priv_mode.eq(1)
            comb += r.req.addr.eq(m_in.addr)
            comb += r.req.data.eq(m_in.pte)
            comb += r.req.byte_sel.eq(~0) # Const -1 sets all to 0b111....
            comb += r.tlbie.eq(m_in.tlbie)
            comb += r.doall.eq(m_in.doall)
            comb += r.tlbld.eq(m_in.tlbld)
            comb += r.mmu_req.eq(1)
            comb += r.d_valid.eq(1)
            m.d.sync += Display("    DCACHE req mmu addr %x pte %x ld %d",
                                 m_in.addr, m_in.pte, r.req.load)

        with m.Else():
            comb += r.req.eq(d_in)
            comb += r.req.data.eq(0)
            comb += r.tlbie.eq(0)
            comb += r.doall.eq(0)
            comb += r.tlbld.eq(0)
            comb += r.mmu_req.eq(0)
            comb += r.d_valid.eq(0)

        sync += r0_full.eq(0)
        with m.If((~r1.full & ~d_in.hold) | ~r0_full):
            sync += r0.eq(r)
            sync += r0_full.eq(r.req.valid)
        with m.Elif(~r0.d_valid):
            # Sample data the cycle after a request comes in from loadstore1.
            # If another request has come in already then the data will get
            # put directly into req.data below.
            sync += r0.req.data.eq(d_in.data)
            sync += r0.d_valid.eq(1)
        with m.If(d_in.valid):
            m.d.sync += Display("    DCACHE req cache "
                                "virt %d addr %x data %x ld %d",
                                 r.req.virt_mode, r.req.addr,
                                 r.req.data, r.req.load)

    def tlb_read(self, m, r0_stall, tlb_way):
        """TLB
        Operates in the second cycle on the request latched in r0.req.
        TLB updates write the entry at the end of the second cycle.
        """
        comb = m.d.comb
        sync = m.d.sync
        m_in, d_in = self.m_in, self.d_in

        addrbits = Signal(TLB_SET_BITS)

        amin = TLB_LG_PGSZ
        amax = TLB_LG_PGSZ + TLB_SET_BITS

        with m.If(m_in.valid):
            comb += addrbits.eq(m_in.addr[amin : amax])
        with m.Else():
            comb += addrbits.eq(d_in.addr[amin : amax])

        # If we have any op and the previous op isn't finished,
        # then keep the same output for next cycle.
        d = self.dtlb_update
        comb += d.tlb_read_index.eq(addrbits)
        comb += d.tlb_read.eq(~r0_stall)
        comb += tlb_way.eq(d.tlb_way)

    def maybe_tlb_plrus(self, m, r1, tlb_plru_victim, tlb_req_index):
        """Generate TLB PLRUs
        """
        comb = m.d.comb
        sync = m.d.sync

        if TLB_NUM_WAYS == 0:
            return

        # suite of PLRUs with a selection and output mechanism
        tlb_plrus = PLRUs(TLB_SET_SIZE, TLB_WAY_BITS)
        m.submodules.tlb_plrus = tlb_plrus
        comb += tlb_plrus.way.eq(r1.tlb_hit.way)
        comb += tlb_plrus.valid.eq(r1.tlb_hit.valid)
        comb += tlb_plrus.index.eq(r1.tlb_hit_index)
        comb += tlb_plrus.isel.eq(tlb_req_index) # select victim
        comb += tlb_plru_victim.eq(tlb_plrus.o_index) # selected victim

    def tlb_search(self, m, tlb_req_index, r0, r0_valid,
                   tlb_way,
                   pte, tlb_hit, valid_ra, perm_attr, ra):

        comb = m.d.comb

        hitway = Signal(TLB_WAY_BITS)
        hit    = Signal()
        eatag  = Signal(TLB_EA_TAG_BITS)

        TLB_LG_END = TLB_LG_PGSZ + TLB_SET_BITS
        comb += tlb_req_index.eq(r0.req.addr[TLB_LG_PGSZ : TLB_LG_END])
        comb += eatag.eq(r0.req.addr[TLB_LG_END : 64 ])

        for i in range(TLB_NUM_WAYS):
            is_tag_hit = Signal(name="is_tag_hit%d" % i)
            tlb_tag = Signal(TLB_EA_TAG_BITS, name="tlb_tag%d" % i)
            comb += tlb_tag.eq(read_tlb_tag(i, tlb_way.tag))
            comb += is_tag_hit.eq((tlb_way.valid[i]) & (tlb_tag == eatag))
            with m.If(is_tag_hit):
                comb += hitway.eq(i)
                comb += hit.eq(1)

        comb += tlb_hit.valid.eq(hit & r0_valid)
        comb += tlb_hit.way.eq(hitway)

        with m.If(tlb_hit.valid):
            comb += pte.eq(read_tlb_pte(hitway, tlb_way.pte))
        comb += valid_ra.eq(tlb_hit.valid | ~r0.req.virt_mode)

        with m.If(r0.req.virt_mode):
            comb += ra.eq(Cat(Const(0, ROW_OFF_BITS),
                              r0.req.addr[ROW_OFF_BITS:TLB_LG_PGSZ],
                              pte[TLB_LG_PGSZ:REAL_ADDR_BITS]))
            comb += perm_attr.reference.eq(pte[8])
            comb += perm_attr.changed.eq(pte[7])
            comb += perm_attr.nocache.eq(pte[5])
            comb += perm_attr.priv.eq(pte[3])
            comb += perm_attr.rd_perm.eq(pte[2])
            comb += perm_attr.wr_perm.eq(pte[1])
        with m.Else():
            comb += ra.eq(Cat(Const(0, ROW_OFF_BITS),
                              r0.req.addr[ROW_OFF_BITS:REAL_ADDR_BITS]))
            comb += perm_attr.reference.eq(1)
            comb += perm_attr.changed.eq(1)
            comb += perm_attr.nocache.eq(0)
            comb += perm_attr.priv.eq(1)
            comb += perm_attr.rd_perm.eq(1)
            comb += perm_attr.wr_perm.eq(1)

        with m.If(valid_ra):
            m.d.sync += Display("DCACHE virt mode %d hit %d ra %x pte %x",
                                r0.req.virt_mode, tlb_hit.valid, ra, pte)
            m.d.sync += Display("       perm ref=%d", perm_attr.reference)
            m.d.sync += Display("       perm chg=%d", perm_attr.changed)
            m.d.sync += Display("       perm noc=%d", perm_attr.nocache)
            m.d.sync += Display("       perm prv=%d", perm_attr.priv)
            m.d.sync += Display("       perm rdp=%d", perm_attr.rd_perm)
            m.d.sync += Display("       perm wrp=%d", perm_attr.wr_perm)

    def tlb_update(self, m, r0_valid, r0, tlb_req_index,
                    tlb_hit, tlb_plru_victim):

        comb = m.d.comb
        sync = m.d.sync

        tlbie = Signal()
        tlbwe = Signal()

        comb += tlbie.eq(r0_valid & r0.tlbie)
        comb += tlbwe.eq(r0_valid & r0.tlbld)

        d = self.dtlb_update

        comb += d.tlbie.eq(tlbie)
        comb += d.tlbwe.eq(tlbwe)
        comb += d.doall.eq(r0.doall)
        comb += d.tlb_hit.eq(tlb_hit)
        comb += d.tlb_req_index.eq(tlb_req_index)

        with m.If(tlb_hit.valid):
            comb += d.repl_way.eq(tlb_hit.way)
        with m.Else():
            comb += d.repl_way.eq(tlb_plru_victim)
        comb += d.eatag.eq(r0.req.addr[TLB_LG_PGSZ + TLB_SET_BITS:64])
        comb += d.pte_data.eq(r0.req.data)

    def maybe_plrus(self, m, r1, plru_victim):
        """Generate PLRUs
        """
        comb = m.d.comb
        sync = m.d.sync

        if NUM_WAYS == 0:
            return

        # suite of PLRUs with a selection and output mechanism
        m.submodules.plrus = plrus = PLRUs(NUM_LINES, WAY_BITS)
        comb += plrus.way.eq(r1.hit_way)
        comb += plrus.valid.eq(r1.cache_hit)
        comb += plrus.index.eq(r1.hit_index)
        comb += plrus.isel.eq(r1.store_index) # select victim
        comb += plru_victim.eq(plrus.o_index) # selected victim

    def cache_tag_read(self, m, r0_stall, req_index, cache_tag_set,
                       cache_tags):
        """Cache tag RAM read port
        """
        comb = m.d.comb
        sync = m.d.sync
        m_in, d_in = self.m_in, self.d_in

        index = Signal(INDEX_BITS)

        with m.If(r0_stall):
            comb += index.eq(req_index)
        with m.Elif(m_in.valid):
            comb += index.eq(get_index(m_in.addr))
        with m.Else():
            comb += index.eq(get_index(d_in.addr))
        sync += cache_tag_set.eq(cache_tags[index])

    def dcache_request(self, m, r0, ra, req_index, req_row, req_tag,
                       r0_valid, r1, cache_tags, cache_valids, replace_way,
                       use_forward1_next, use_forward2_next,
                       req_hit_way, plru_victim, rc_ok, perm_attr,
                       valid_ra, perm_ok, access_ok, req_op, req_go,
                       tlb_hit, tlb_way, cache_tag_set,
                       cancel_store, req_same_tag, r0_stall, early_req_row):
        """Cache request parsing and hit detection
        """

        comb = m.d.comb
        m_in, d_in = self.m_in, self.d_in

        is_hit      = Signal()
        hit_way     = Signal(WAY_BITS)
        op          = Signal(Op)
        opsel       = Signal(3)
        go          = Signal()
        nc          = Signal()
        cache_i_validdx = Signal(NUM_WAYS)

        # Extract line, row and tag from request
        comb += req_index.eq(get_index(r0.req.addr))
        comb += req_row.eq(get_row(r0.req.addr))
        comb += req_tag.eq(get_tag(ra))

        if False: # display on comb is a bit... busy.
            comb += Display("dcache_req addr:%x ra: %x idx: %x tag: %x row: %x",
                    r0.req.addr, ra, req_index, req_tag, req_row)

        comb += go.eq(r0_valid & ~(r0.tlbie | r0.tlbld) & ~r1.ls_error)
        comb += cache_i_validdx.eq(cache_valids[req_index])

        m.submodules.dcache_pend = dc = DCachePendingHit(tlb_way,
                                            cache_i_validdx, cache_tag_set,
                                            r0.req.addr)
        comb += dc.tlb_hit.eq(tlb_hit)
        comb += dc.reload_tag.eq(r1.reload_tag)
        comb += dc.virt_mode.eq(r0.req.virt_mode)
        comb += dc.go.eq(go)
        comb += dc.req_index.eq(req_index)

        comb += is_hit.eq(dc.is_hit)
        comb += hit_way.eq(dc.hit_way)
        comb += req_same_tag.eq(dc.rel_match)

        # See if the request matches the line currently being reloaded
        with m.If((r1.state == State.RELOAD_WAIT_ACK) &
                  (req_index == r1.store_index) & req_same_tag):
            # For a store, consider this a hit even if the row isn't
            # valid since it will be by the time we perform the store.
            # For a load, check the appropriate row valid bit.
            rrow = Signal(ROW_LINE_BITS)
            comb += rrow.eq(req_row)
            valid = r1.rows_valid[rrow]
            comb += is_hit.eq((~r0.req.load) | valid)
            comb += hit_way.eq(replace_way)

        # Whether to use forwarded data for a load or not
        with m.If((get_row(r1.req.real_addr) == req_row) &
                  (r1.req.hit_way == hit_way)):
            # Only need to consider r1.write_bram here, since if we
            # are writing refill data here, then we don't have a
            # cache hit this cycle on the line being refilled.
            # (There is the possibility that the load following the
            # load miss that started the refill could be to the old
            # contents of the victim line, since it is a couple of
            # cycles after the refill starts before we see the updated
            # cache tag.  In that case we don't use the bypass.)
            comb += use_forward1_next.eq(r1.write_bram)
        with m.If((r1.forward_row1 == req_row) & (r1.forward_way1 == hit_way)):
            comb += use_forward2_next.eq(r1.forward_valid1)

        # The way that matched on a hit
        comb += req_hit_way.eq(hit_way)

        # The way to replace on a miss
        with m.If(r1.write_tag):
            comb += replace_way.eq(plru_victim)
        with m.Else():
            comb += replace_way.eq(r1.store_way)

        # work out whether we have permission for this access
        # NB we don't yet implement AMR, thus no KUAP
        comb += rc_ok.eq(perm_attr.reference
                         & (r0.req.load | perm_attr.changed))
        comb += perm_ok.eq((r0.req.priv_mode | (~perm_attr.priv)) &
                           (perm_attr.wr_perm |
                              (r0.req.load & perm_attr.rd_perm)))
        comb += access_ok.eq(valid_ra & perm_ok & rc_ok)

        # Combine the request and cache hit status to decide what
        # operation needs to be done
        comb += nc.eq(r0.req.nc | perm_attr.nocache)
        comb += op.eq(Op.OP_NONE)
        with m.If(go):
            with m.If(~access_ok):
                m.d.sync += Display("DCACHE access fail valid_ra=%d p=%d rc=%d",
                                 valid_ra, perm_ok, rc_ok)
                comb += op.eq(Op.OP_BAD)
            with m.Elif(cancel_store):
                m.d.sync += Display("DCACHE cancel store")
                comb += op.eq(Op.OP_STCX_FAIL)
            with m.Else():
                m.d.sync += Display("DCACHE valid_ra=%d nc=%d ld=%d",
                                 valid_ra, nc, r0.req.load)
                comb += opsel.eq(Cat(is_hit, nc, r0.req.load))
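                # opsel bits are {load, nc, is_hit} (Cat packs LSB-first):
                #   101 load,  cacheable, hit   -> OP_LOAD_HIT
                #   100 load,  cacheable, miss  -> OP_LOAD_MISS
                #   110 load,  nc               -> OP_LOAD_NC
                #   001 store, cacheable, hit   -> OP_STORE_HIT
                #   000 / 010 store (miss / nc) -> OP_STORE_MISS
                #   011 / 111 nc "hit": paradox -> OP_BAD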
                with m.Switch(opsel):
                    with m.Case(0b101): comb += op.eq(Op.OP_LOAD_HIT)
                    with m.Case(0b100): comb += op.eq(Op.OP_LOAD_MISS)
                    with m.Case(0b110): comb += op.eq(Op.OP_LOAD_NC)
                    with m.Case(0b001): comb += op.eq(Op.OP_STORE_HIT)
                    with m.Case(0b000): comb += op.eq(Op.OP_STORE_MISS)
                    with m.Case(0b010): comb += op.eq(Op.OP_STORE_MISS)
                    with m.Case(0b011): comb += op.eq(Op.OP_BAD)
                    with m.Case(0b111): comb += op.eq(Op.OP_BAD)
        comb += req_op.eq(op)
        comb += req_go.eq(go)

        # Version of the row number that is valid one cycle earlier
        # in the cases where we need to read the cache data BRAM.
        # If we're stalling then we need to keep reading the last
        # row requested.
        with m.If(~r0_stall):
            with m.If(m_in.valid):
                comb += early_req_row.eq(get_row(m_in.addr))
            with m.Else():
                comb += early_req_row.eq(get_row(d_in.addr))
        with m.Else():
            comb += early_req_row.eq(req_row)

    def reservation_comb(self, m, cancel_store, set_rsrv, clear_rsrv,
                         r0_valid, r0, reservation):
        """Handle load-with-reservation and store-conditional instructions
        """
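        # (background: lwarx/ldarx with atomic_last set establishes the
        #  reservation; stwcx./stdcx. clears it, and cancel_store below
        #  fails the store-conditional when no reservation exists or the
        #  reserved address - kept at cache-line granularity - differs)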
        comb = m.d.comb

        with m.If(r0_valid & r0.req.reserve):
            # XXX generate alignment interrupt if address
            # is not aligned XXX or if r0.req.nc = '1'
            with m.If(r0.req.load):
                comb += set_rsrv.eq(r0.req.atomic_last) # load with reservation
            with m.Else():
                comb += clear_rsrv.eq(r0.req.atomic_last) # store conditional
                with m.If((~reservation.valid) |
                          (r0.req.addr[LINE_OFF_BITS:64] != reservation.addr)):
                    comb += cancel_store.eq(1)

    def reservation_reg(self, m, r0_valid, access_ok, set_rsrv, clear_rsrv,
                        reservation, r0):
        comb = m.d.comb
        sync = m.d.sync

        with m.If(r0_valid & access_ok):
            with m.If(clear_rsrv):
                sync += reservation.valid.eq(0)
            with m.Elif(set_rsrv):
                sync += reservation.valid.eq(1)
                sync += reservation.addr.eq(r0.req.addr[LINE_OFF_BITS:64])

    def writeback_control(self, m, r1, cache_out_row):
        """Return data for loads & completion control logic
        """
        comb = m.d.comb
        sync = m.d.sync
        d_out, m_out = self.d_out, self.m_out

        data_out = Signal(64)
        data_fwd = Signal(64)

        # Use the bypass if we are reading the row that was
        # written 1 or 2 cycles ago, including for the
        # slow_valid = 1 case (i.e. completing a load
        # miss or a non-cacheable load).
        with m.If(r1.use_forward1):
            comb += data_fwd.eq(r1.forward_data1)
        with m.Else():
            comb += data_fwd.eq(r1.forward_data2)

        comb += data_out.eq(cache_out_row)

        for i in range(8):
            with m.If(r1.forward_sel[i]):
                dsel = data_fwd.word_select(i, 8)
                comb += data_out.word_select(i, 8).eq(dsel)

        # DCache output to LoadStore
        comb += d_out.valid.eq(r1.ls_valid)
        comb += d_out.data.eq(data_out)
        comb += d_out.store_done.eq(~r1.stcx_fail)
        comb += d_out.error.eq(r1.ls_error)
        comb += d_out.cache_paradox.eq(r1.cache_paradox)

        # Outputs to MMU
        comb += m_out.done.eq(r1.mmu_done)
        comb += m_out.err.eq(r1.mmu_error)
        comb += m_out.data.eq(data_out)

        # We have a valid load or store hit or we just completed
        # a slow op such as a load miss, a NC load or a store
        #
        # Note: the load hit is delayed by one cycle.  However it
        # can still not collide with r.slow_valid (well unless I
        # miscalculated) because slow_valid can only be set on a
        # subsequent request and not on its first cycle (the state
        # machine must have advanced), which makes slow_valid
        # at least 2 cycles from the previous hit_load_valid.

        # Sanity: Only one of these must be set in any given cycle

        if False: # TODO: need Display to get this to work
            assert (r1.slow_valid & r1.stcx_fail) != 1, \
                "unexpected slow_valid collision with stcx_fail"

            assert ((r1.slow_valid | r1.stcx_fail) | r1.hit_load_valid) != 1, \
                "unexpected hit_load_delayed collision with slow_valid"

        with m.If(~r1.mmu_req):
            # Request came from loadstore1...
            # Load hit case is the standard path
            with m.If(r1.hit_load_valid):
                sync += Display("completing load hit data=%x", data_out)

            # error cases complete without stalling
            with m.If(r1.ls_error):
                with m.If(r1.dcbz):
                    sync += Display("completing dcbz with error")
                with m.Else():
                    sync += Display("completing ld/st with error")

            # Slow ops (load miss, NC, stores)
            with m.If(r1.slow_valid):
                sync += Display("completing store or load miss adr=%x data=%x",
                                r1.req.real_addr, data_out)

        with m.Else():
            # Request came from MMU
            with m.If(r1.hit_load_valid):
                sync += Display("completing load hit to MMU, data=%x",
                                m_out.data)
            # error cases complete without stalling
            with m.If(r1.mmu_error):
                sync += Display("completing MMU ld with error")

            # Slow ops (i.e. load miss)
            with m.If(r1.slow_valid):
                sync += Display("completing MMU load miss, adr=%x data=%x",
                                r1.req.real_addr, m_out.data)

    def rams(self, m, r1, early_req_row, cache_out_row, replace_way):
        """rams
        Generate a cache RAM for each way.  This handles the normal
        reads, writes from reloads and the special store-hit update
        path as well.

        Note: the BRAMs have an extra read buffer, meaning the output
        is pipelined an extra cycle.  This differs from the
        icache.  The writeback logic needs to take that into
        account by using 1-cycle delayed signals for load hits.
        """
        comb = m.d.comb
        bus = self.bus

        # Binary-to-Unary one-hot decoders here.  replace-way one-hot is gated
        # (enabled) by bus.ack, not-write-bram, and state RELOAD_WAIT_ACK
        m.submodules.rams_replace_way_e = rwe = Decoder(NUM_WAYS)
        comb += rwe.n.eq(~((r1.state == State.RELOAD_WAIT_ACK) & bus.ack &
                           ~r1.write_bram))
        comb += rwe.i.eq(replace_way)

        m.submodules.rams_hit_way_e = hwe = Decoder(NUM_WAYS)
        comb += hwe.i.eq(r1.hit_way)

        # this one is gated with write_bram, and replace_way_e can never be
        # set at the same time.  that means that do_write can OR the outputs
        m.submodules.rams_hit_req_way_e = hre = Decoder(NUM_WAYS)
        comb += hre.n.eq(~r1.write_bram) # Decoder.n is inverted
        comb += hre.i.eq(r1.req.hit_way)

        # common Signals
        do_read  = Signal()
        wr_addr  = Signal(ROW_BITS)
        wr_data  = Signal(WB_DATA_BITS)
        wr_sel   = Signal(ROW_SIZE)
        rd_addr  = Signal(ROW_BITS)

        comb += do_read.eq(1) # always enable
        comb += rd_addr.eq(early_req_row)

        # Write mux:
        #
        # Defaults to wishbone read responses (cache refill)
        #
        # For timing, the mux on wr_data/sel/addr is not
        # dependent on anything other than the current state.

        with m.If(r1.write_bram):
            # Write store data to BRAM.  This happens one
            # cycle after the store is in r0.
            comb += wr_data.eq(r1.req.data)
            comb += wr_sel.eq(r1.req.byte_sel)
            comb += wr_addr.eq(get_row(r1.req.real_addr))

        with m.Else():
            # Otherwise, we might be doing a reload or a DCBZ
            with m.If(r1.dcbz):
                comb += wr_data.eq(0)
            with m.Else():
                comb += wr_data.eq(bus.dat_r)
            comb += wr_addr.eq(r1.store_row)
            comb += wr_sel.eq(~0) # all 1s

        # set up Cache Rams
        for i in range(NUM_WAYS):
            do_write = Signal(name="do_wr%d" % i)
            wr_sel_m = Signal(ROW_SIZE, name="wr_sel_m_%d" % i)
            d_out    = Signal(WB_DATA_BITS, name="dout_%d" % i) # cache_row_t

            way = CacheRam(ROW_BITS, WB_DATA_BITS, ADD_BUF=True, ram_num=i)
            m.submodules["cacheram_%d" % i] = way

            comb += way.rd_en.eq(do_read)
            comb += way.rd_addr.eq(rd_addr)
            comb += d_out.eq(way.rd_data_o)
            comb += way.wr_sel.eq(wr_sel_m)
            comb += way.wr_addr.eq(wr_addr)
            comb += way.wr_data.eq(wr_data)

            # Cache hit reads
            with m.If(hwe.o[i]):
                comb += cache_out_row.eq(d_out)

            # these are mutually-exclusive via their Decoder-enablers
            # (note: Decoder-enable is inverted)
            comb += do_write.eq(hre.o[i] | rwe.o[i])

            # Mask write selects with do_write since BRAM
            # doesn't have a global write-enable
            with m.If(do_write):
                comb += wr_sel_m.eq(wr_sel)

    # Cache hit synchronous machine for the easy case.
    # This handles load hits.
    # It also handles error cases (TLB miss, cache paradox)
    def dcache_fast_hit(self, m, req_op, r0_valid, r0, r1,
                        req_hit_way, req_index, req_tag, access_ok,
                        tlb_hit, tlb_req_index):
        comb = m.d.comb
        sync = m.d.sync

        with m.If(req_op != Op.OP_NONE):
            sync += Display("op:%d addr:%x nc: %d idx: %x tag: %x way: %x",
                    req_op, r0.req.addr, r0.req.nc,
                    req_index, req_tag, req_hit_way)

        with m.If(r0_valid):
            sync += r1.mmu_req.eq(r0.mmu_req)

        # Fast path for load/store hits.
        # Set signals for the writeback controls.
        sync += r1.hit_way.eq(req_hit_way)
        sync += r1.hit_index.eq(req_index)

        sync += r1.hit_load_valid.eq(req_op == Op.OP_LOAD_HIT)
        sync += r1.cache_hit.eq((req_op == Op.OP_LOAD_HIT) |
                                (req_op == Op.OP_STORE_HIT))

        with m.If(req_op == Op.OP_BAD):
            sync += Display("Signalling ld/st error "
                            "ls_error=%i mmu_error=%i cache_paradox=%i",
                            ~r0.mmu_req, r0.mmu_req, access_ok)
            sync += r1.ls_error.eq(~r0.mmu_req)
            sync += r1.mmu_error.eq(r0.mmu_req)
            sync += r1.cache_paradox.eq(access_ok)
        with m.Else():
            sync += r1.ls_error.eq(0)
            sync += r1.mmu_error.eq(0)
            sync += r1.cache_paradox.eq(0)

        sync += r1.stcx_fail.eq(req_op == Op.OP_STCX_FAIL)

        # Record TLB hit information for updating TLB PLRU
        sync += r1.tlb_hit.eq(tlb_hit)
        sync += r1.tlb_hit_index.eq(tlb_req_index)

    # Memory accesses are handled by this state machine:
    #
    #   * Cache load miss/reload (in conjunction with "rams")
    #   * Load hits for non-cachable forms
    #   * Stores (the collision case is handled in "rams")
    #
    # All wishbone request generation is done here.
    # This machine operates at stage 1.
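    #
    # State transitions (a sketch): IDLE -> RELOAD_WAIT_ACK on a load
    # miss or a dcbz, IDLE -> STORE_WAIT_ACK on a (non-dcbz) store, and
    # IDLE -> NC_LOAD_WAIT_ACK on a non-cachable load; each returns to
    # IDLE when its wishbone cycle completes (the last ack is received).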
    def dcache_slow(self, m, r1, use_forward1_next, use_forward2_next,
                    r0, replace_way,
                    req_hit_way, req_same_tag,
                    r0_valid, req_op, cache_tags, cache_valids, req_go, ra):

        comb = m.d.comb
        sync = m.d.sync
        bus = self.bus
        d_in = self.d_in

        req = MemAccessRequest("mreq_ds")

        r1_next_cycle = Signal()
        req_row = Signal(ROW_BITS)
        req_idx = Signal(INDEX_BITS)
        req_tag = Signal(TAG_BITS)
        comb += req_idx.eq(get_index(req.real_addr))
        comb += req_row.eq(get_row(req.real_addr))
        comb += req_tag.eq(get_tag(req.real_addr))

        sync += r1.use_forward1.eq(use_forward1_next)
        sync += r1.forward_sel.eq(0)

        with m.If(use_forward1_next):
            sync += r1.forward_sel.eq(r1.req.byte_sel)
        with m.Elif(use_forward2_next):
            sync += r1.forward_sel.eq(r1.forward_sel1)

        sync += r1.forward_data2.eq(r1.forward_data1)
        with m.If(r1.write_bram):
            sync += r1.forward_data1.eq(r1.req.data)
            sync += r1.forward_sel1.eq(r1.req.byte_sel)
            sync += r1.forward_way1.eq(r1.req.hit_way)
            sync += r1.forward_row1.eq(get_row(r1.req.real_addr))
            sync += r1.forward_valid1.eq(1)
        with m.Else():
            with m.If(r1.dcbz):
                sync += r1.forward_data1.eq(0)
            with m.Else():
                sync += r1.forward_data1.eq(bus.dat_r)
            sync += r1.forward_sel1.eq(~0) # all 1s
            sync += r1.forward_way1.eq(replace_way)
            sync += r1.forward_row1.eq(r1.store_row)
            sync += r1.forward_valid1.eq(0)

        # One cycle pulses reset
        sync += r1.slow_valid.eq(0)
        sync += r1.write_bram.eq(0)
        sync += r1.inc_acks.eq(0)
        sync += r1.dec_acks.eq(0)

        sync += r1.ls_valid.eq(0)
        # complete tlbies and TLB loads in the third cycle
        sync += r1.mmu_done.eq(r0_valid & (r0.tlbie | r0.tlbld))

        with m.If((req_op == Op.OP_LOAD_HIT) | (req_op == Op.OP_STCX_FAIL)):
            with m.If(r0.mmu_req):
                sync += r1.mmu_done.eq(1)
            with m.Else():
                sync += r1.ls_valid.eq(1)

        with m.If(r1.write_tag):
            # Store new tag in selected way
            replace_way_onehot = Signal(NUM_WAYS)
            comb += replace_way_onehot.eq(1<<replace_way)
            for i in range(NUM_WAYS):
                with m.If(replace_way_onehot[i]):
                    ct = Signal(TAG_RAM_WIDTH)
                    comb += ct.eq(cache_tags[r1.store_index])
                    comb += ct.word_select(i, TAG_WIDTH).eq(r1.reload_tag)
                    sync += cache_tags[r1.store_index].eq(ct)
            sync += r1.store_way.eq(replace_way)
            sync += r1.write_tag.eq(0)

        # Take request from r1.req if there is one there,
        # else from req_op, ra, etc.
        with m.If(r1.full):
            comb += req.eq(r1.req)
        with m.Else():
            comb += req.op.eq(req_op)
            comb += req.valid.eq(req_go)
            comb += req.mmu_req.eq(r0.mmu_req)
            comb += req.dcbz.eq(r0.req.dcbz)
            comb += req.real_addr.eq(ra)

            with m.If(r0.req.dcbz):
                # force data to 0 for dcbz
                comb += req.data.eq(0)
            with m.Elif(r0.d_valid):
                comb += req.data.eq(r0.req.data)
            with m.Else():
                comb += req.data.eq(d_in.data)

            # Select all bytes for dcbz
            # and for cacheable loads
            with m.If(r0.req.dcbz | (r0.req.load & ~r0.req.nc)):
                comb += req.byte_sel.eq(~0) # all 1s
            with m.Else():
                comb += req.byte_sel.eq(r0.req.byte_sel)
            comb += req.hit_way.eq(req_hit_way)
            comb += req.same_tag.eq(req_same_tag)

        # Store the incoming request from r0,
        # if it is a slow request
        # Note that r1.full = 1 implies req_op = OP_NONE
        with m.If((req_op == Op.OP_LOAD_MISS)
                  | (req_op == Op.OP_LOAD_NC)
                  | (req_op == Op.OP_STORE_MISS)
                  | (req_op == Op.OP_STORE_HIT)):
            sync += r1.req.eq(req)
            sync += r1.full.eq(1)
            # do not let r1.state RELOAD_WAIT_ACK or STORE_WAIT_ACK
            # destroy r1.req by overwriting r1.full back to zero
            comb += r1_next_cycle.eq(1)

        # Main state machine
        with m.Switch(r1.state):

            with m.Case(State.IDLE):
                sync += r1.wb.adr.eq(req.real_addr[ROW_OFF_BITS:])
                sync += r1.wb.sel.eq(req.byte_sel)
                sync += r1.wb.dat.eq(req.data)
                sync += r1.dcbz.eq(req.dcbz)

                # Keep track of our index and way
                # for subsequent stores.
                sync += r1.store_index.eq(req_idx)
                sync += r1.store_row.eq(req_row)
                sync += r1.end_row_ix.eq(get_row_of_line(req_row)-1)
                sync += r1.reload_tag.eq(req_tag)
                sync += r1.req.same_tag.eq(1)

                with m.If(req.op == Op.OP_STORE_HIT):
                    sync += r1.store_way.eq(req.hit_way)

                #with m.If(r1.dec_acks):
                #    sync += r1.acks_pending.eq(r1.acks_pending - 1)

                # Reset per-row valid bits,
                # ready for handling OP_LOAD_MISS
                for i in range(ROW_PER_LINE):
                    sync += r1.rows_valid[i].eq(0)

                with m.If(req_op != Op.OP_NONE):
                    sync += Display("cache op %d", req.op)

                with m.Switch(req.op):
                    with m.Case(Op.OP_LOAD_HIT):
                        # stay in IDLE state
                        pass

                    with m.Case(Op.OP_LOAD_MISS):
                        sync += Display("cache miss real addr: %x " \
                                "idx: %x tag: %x",
                                req.real_addr, req_row, req_tag)

                        # Start the wishbone cycle
                        sync += r1.wb.we.eq(0)
                        sync += r1.wb.cyc.eq(1)
                        sync += r1.wb.stb.eq(1)

                        # Track that we had one request sent
                        sync += r1.state.eq(State.RELOAD_WAIT_ACK)
                        sync += r1.write_tag.eq(1)

                    with m.Case(Op.OP_LOAD_NC):
                        sync += r1.wb.cyc.eq(1)
                        sync += r1.wb.stb.eq(1)
                        sync += r1.wb.we.eq(0)
                        sync += r1.state.eq(State.NC_LOAD_WAIT_ACK)

                    with m.Case(Op.OP_STORE_HIT, Op.OP_STORE_MISS):
                        with m.If(~req.dcbz):
                            sync += r1.state.eq(State.STORE_WAIT_ACK)
                            sync += r1.acks_pending.eq(1)
                            sync += r1.full.eq(0)
                            comb += r1_next_cycle.eq(0)
                            sync += r1.slow_valid.eq(1)

                            with m.If(req.mmu_req):
                                sync += r1.mmu_done.eq(1)
                            with m.Else():
                                sync += r1.ls_valid.eq(1)

                            with m.If(req.op == Op.OP_STORE_HIT):
                                sync += r1.write_bram.eq(1)
                        with m.Else():
                            # dcbz is handled much like a load miss except
                            # that we are writing to memory instead of reading
                            sync += r1.state.eq(State.RELOAD_WAIT_ACK)

                            with m.If(req.op == Op.OP_STORE_MISS):
                                sync += r1.write_tag.eq(1)

                        sync += r1.wb.we.eq(1)
                        sync += r1.wb.cyc.eq(1)
                        sync += r1.wb.stb.eq(1)

                    # OP_NONE and OP_BAD do nothing
                    # OP_BAD & OP_STCX_FAIL were
                    # handled above already
                    with m.Case(Op.OP_NONE):
                        pass
                    with m.Case(Op.OP_BAD):
                        pass
                    with m.Case(Op.OP_STCX_FAIL):
                        pass

            with m.Case(State.RELOAD_WAIT_ACK):
                ld_stbs_done = Signal()
                # Requests are all sent if stb is 0
                comb += ld_stbs_done.eq(~r1.wb.stb)

                # If we are still sending requests, was one accepted?
                with m.If((~bus.stall) & r1.wb.stb):
                    # That was the last word?  We are done sending.
                    # Clear stb and set ld_stbs_done so we can handle an
                    # eventual last ack on the same cycle.
                    # sigh - reconstruct wb adr with 3 extra 0s at front
                    wb_adr = Cat(Const(0, ROW_OFF_BITS), r1.wb.adr)
                    with m.If(is_last_row_addr(wb_adr, r1.end_row_ix)):
                        sync += r1.wb.stb.eq(0)
                        comb += ld_stbs_done.eq(1)

                    # Calculate the next row address in the current cache line
                    row = Signal(LINE_OFF_BITS-ROW_OFF_BITS)
                    comb += row.eq(r1.wb.adr)
                    sync += r1.wb.adr[:LINE_OFF_BITS-ROW_OFF_BITS].eq(row+1)

                # Incoming acks processing
                sync += r1.forward_valid1.eq(bus.ack)
                with m.If(bus.ack):
                    srow = Signal(ROW_LINE_BITS)
                    comb += srow.eq(r1.store_row)
                    sync += r1.rows_valid[srow].eq(1)

                    # If this is the data we were looking for,
                    # we can complete the request next cycle.
                    # Compare the whole address in case the
                    # request in r1.req is not the one that
                    # started this refill.
                    with m.If(r1.full & r1.req.same_tag &
                              ((r1.dcbz & req.dcbz) |
                               (r1.req.op == Op.OP_LOAD_MISS)) &
                              (r1.store_row == get_row(r1.req.real_addr))):
                        sync += r1.full.eq(r1_next_cycle)
                        sync += r1.slow_valid.eq(1)
                        with m.If(r1.mmu_req):
                            sync += r1.mmu_done.eq(1)
                        with m.Else():
                            sync += r1.ls_valid.eq(1)
                        sync += r1.forward_sel.eq(~0) # all 1s
                        sync += r1.use_forward1.eq(1)

                    # Check for completion
                    with m.If(ld_stbs_done & is_last_row(r1.store_row,
                                                         r1.end_row_ix)):
                        # Complete wishbone cycle
                        sync += r1.wb.cyc.eq(0)

                        # Cache line is now valid
                        cv = Signal(NUM_WAYS)
                        comb += cv.eq(cache_valids[r1.store_index])
                        comb += cv.bit_select(r1.store_way, 1).eq(1)
                        sync += cache_valids[r1.store_index].eq(cv)

                        sync += r1.state.eq(State.IDLE)
                        sync += Display("cache valid set %x "
                                        "idx %d way %d",
                                        cv, r1.store_index, r1.store_way)

                    # Increment store row counter
                    sync += r1.store_row.eq(next_row(r1.store_row))

            with m.Case(State.STORE_WAIT_ACK):
                st_stbs_done = Signal()
                adjust_acks = Signal(3)

                comb += st_stbs_done.eq(~r1.wb.stb)

                with m.If(r1.inc_acks != r1.dec_acks):
                    with m.If(r1.inc_acks):
                        comb += adjust_acks.eq(r1.acks_pending + 1)
                    with m.Else():
                        comb += adjust_acks.eq(r1.acks_pending - 1)
                with m.Else():
                    comb += adjust_acks.eq(r1.acks_pending)

                sync += r1.acks_pending.eq(adjust_acks)

                # Clear stb when slave accepted request
                with m.If(~bus.stall):
                    # See if there is another store waiting
                    # to be done which is in the same real page.
                    # (this is when same_tag is true)
                    with m.If(req.valid):
                        _ra = req.real_addr[ROW_OFF_BITS:SET_SIZE_BITS]
                        sync += r1.wb.adr[0:SET_SIZE_BITS-ROW_OFF_BITS].eq(_ra)
                        sync += r1.wb.dat.eq(req.data)
                        sync += r1.wb.sel.eq(req.byte_sel)

                    with m.If((adjust_acks < 7) & req.same_tag &
                              ((req.op == Op.OP_STORE_MISS) |
                               (req.op == Op.OP_STORE_HIT))):
                        sync += r1.wb.stb.eq(1)
                        comb += st_stbs_done.eq(0)
                        sync += r1.store_way.eq(req.hit_way)
                        sync += r1.store_row.eq(get_row(req.real_addr))

                        with m.If(req.op == Op.OP_STORE_HIT):
                            sync += r1.write_bram.eq(1)
                        sync += r1.full.eq(r1_next_cycle)
                        sync += r1.slow_valid.eq(1)

                        # Store requests never come from the MMU
                        sync += r1.ls_valid.eq(1)
                        comb += st_stbs_done.eq(0)
                        sync += r1.inc_acks.eq(1)
                    with m.Else():
                        sync += r1.wb.stb.eq(0)
                        comb += st_stbs_done.eq(1)

                # Got ack ? See if complete.
                sync += Display("got ack %d %d stbs %d adjust_acks %d",
                                bus.ack, bus.ack, st_stbs_done, adjust_acks)
                with m.If(bus.ack):
                    with m.If(st_stbs_done & (adjust_acks == 1)):
                        sync += r1.state.eq(State.IDLE)
                        sync += r1.wb.cyc.eq(0)
                        sync += r1.wb.stb.eq(0)
                    sync += r1.dec_acks.eq(1)

            with m.Case(State.NC_LOAD_WAIT_ACK):
                # Clear stb when slave accepted request
                with m.If(~bus.stall):
                    sync += r1.wb.stb.eq(0)

                # Got ack ? complete.
                with m.If(bus.ack):
                    sync += r1.state.eq(State.IDLE)
                    sync += r1.full.eq(r1_next_cycle)
                    sync += r1.slow_valid.eq(1)

                    with m.If(r1.mmu_req):
                        sync += r1.mmu_done.eq(1)
                    with m.Else():
                        sync += r1.ls_valid.eq(1)

                    sync += r1.forward_sel.eq(~0) # all 1s
                    sync += r1.use_forward1.eq(1)
                    sync += r1.wb.cyc.eq(0)
                    sync += r1.wb.stb.eq(0)

    # note: currently unused - the call at the end of elaborate() is
    # commented out.  req_op is passed in explicitly as it is not
    # part of RegStage1.
    def dcache_log(self, m, r1, valid_ra, tlb_hit, stall_out, req_op):

        sync = m.d.sync
        d_out, bus, log_out = self.d_out, self.bus, self.log_out

        sync += log_out.eq(Cat(r1.state[:3], valid_ra, tlb_hit.way[:3],
                               stall_out, req_op[:3], d_out.valid, d_out.error,
                               r1.wb.cyc, r1.wb.stb, bus.ack, bus.stall,
                               # wb.adr is the row address, so its low 3 bits
                               # are real_addr[3:6]
                               r1.wb.adr[:3]))

    def elaborate(self, platform):

        m = Module()
        comb, sync = m.d.comb, m.d.sync
        m_in, d_in = self.m_in, self.d_in

        # Storage. Hopefully "cache_rows" is a BRAM, the rest is LUTs
        cache_tags    = CacheTagArray()
        cache_valids  = CacheValidsArray()
        cache_tag_set = Signal(TAG_RAM_WIDTH)

        # TODO attribute ram_style : string;
        # TODO attribute ram_style of cache_tags : signal is "distributed";

        """note: these are passed to nmigen.hdl.Memory as "attributes".
           don't know how, just that they are.
        """
        # TODO attribute ram_style of
        #  dtlb_tags : signal is "distributed";
        # TODO attribute ram_style of
        #  dtlb_ptes : signal is "distributed";

        r0      = RegStage0("r0")
        r0_full = Signal()

        r1 = RegStage1("r1")

        reservation = Reservation("rsrv")

        # Async signals on incoming request
        req_index    = Signal(INDEX_BITS)
        req_row      = Signal(ROW_BITS)
        req_hit_way  = Signal(WAY_BITS)
        req_tag      = Signal(TAG_BITS)
        req_op       = Signal(Op)
        req_data     = Signal(64)
        req_same_tag = Signal()
        req_go       = Signal()

        early_req_row = Signal(ROW_BITS)

        cancel_store = Signal()
        set_rsrv     = Signal()
        clear_rsrv   = Signal()

        r0_valid = Signal()
        r0_stall = Signal()

        use_forward1_next = Signal()
        use_forward2_next = Signal()

        cache_out_row = Signal(WB_DATA_BITS)

        plru_victim = Signal(WAY_BITS)
        replace_way = Signal(WAY_BITS)

        # Wishbone read/write/cache write formatting signals
        bus_sel = Signal(8)

        # TLB signals
        tlb_way = TLBRecord("tlb_way")
        tlb_req_index = Signal(TLB_SET_BITS)
        tlb_hit = TLBHit("tlb_hit")
        pte = Signal(TLB_PTE_BITS)
        ra = Signal(REAL_ADDR_BITS)
        valid_ra = Signal()
        perm_attr = PermAttr("dc_perms")
        rc_ok = Signal()
        perm_ok = Signal()
        access_ok = Signal()

        tlb_plru_victim = Signal(TLB_WAY_BITS)

        # we don't yet handle collisions between loadstore1 requests
        # and MMU requests
        comb += self.m_out.stall.eq(0)

        # Hold off the request in r0 when r1 has an uncompleted request
        comb += r0_stall.eq(r0_full & (r1.full | d_in.hold))
        comb += r0_valid.eq(r0_full & ~r1.full & ~d_in.hold)
        comb += self.stall_out.eq(r0_stall)
        # debugging: detect if any stall ever requested, which is fine,
        # but if a request comes in when stall requested, that's bad.
        with m.If(r0_stall):
            sync += self.any_stall_out.eq(1)
            with m.If(d_in.valid):
                sync += self.dreq_when_stall.eq(1)
            with m.If(m_in.valid):
                sync += self.mreq_when_stall.eq(1)

        # deal with litex not doing wishbone pipeline mode
        # XXX in wrong way.  FIFOs are needed in the SRAM test
        # so that stb/ack match up. same thing done in icache.py
        if not self.microwatt_compat:
            comb += self.bus.stall.eq(self.bus.cyc & ~self.bus.ack)

        # Wire up wishbone request latch out of stage 1
        comb += self.bus.we.eq(r1.wb.we)
        comb += self.bus.adr.eq(r1.wb.adr)
        comb += self.bus.sel.eq(r1.wb.sel)
        comb += self.bus.stb.eq(r1.wb.stb)
        comb += self.bus.dat_w.eq(r1.wb.dat)
        comb += self.bus.cyc.eq(r1.wb.cyc)

        # create submodule DTLBUpdate
        m.submodules.dtlb_update = self.dtlb_update = DTLBUpdate()

        # call sub-functions putting everything together,
        # using shared signals established above
        self.stage_0(m, r0, r1, r0_full)
        self.tlb_read(m, r0_stall, tlb_way)
        self.tlb_search(m, tlb_req_index, r0, r0_valid,
                        tlb_way,
                        pte, tlb_hit, valid_ra, perm_attr, ra)
        self.tlb_update(m, r0_valid, r0, tlb_req_index,
                        tlb_hit, tlb_plru_victim)
        self.maybe_plrus(m, r1, plru_victim)
        self.maybe_tlb_plrus(m, r1, tlb_plru_victim, tlb_req_index)
        self.cache_tag_read(m, r0_stall, req_index, cache_tag_set,
                            cache_tags)
        self.dcache_request(m, r0, ra, req_index, req_row, req_tag,
                            r0_valid, r1, cache_tags, cache_valids,
                            replace_way,
                            use_forward1_next, use_forward2_next,
                            req_hit_way, plru_victim, rc_ok, perm_attr,
                            valid_ra, perm_ok, access_ok, req_op, req_go,
                            tlb_hit, tlb_way, cache_tag_set,
                            cancel_store, req_same_tag, r0_stall,
                            early_req_row)
        self.reservation_comb(m, cancel_store, set_rsrv, clear_rsrv,
                              r0_valid, r0, reservation)
        self.reservation_reg(m, r0_valid, access_ok, set_rsrv, clear_rsrv,
                             reservation, r0)
        self.writeback_control(m, r1, cache_out_row)
        self.rams(m, r1, early_req_row, cache_out_row, replace_way)
        self.dcache_fast_hit(m, req_op, r0_valid, r0, r1,
                             req_hit_way, req_index, req_tag, access_ok,
                             tlb_hit, tlb_req_index)
        self.dcache_slow(m, r1, use_forward1_next, use_forward2_next,
                         r0, replace_way,
                         req_hit_way, req_same_tag,
                         r0_valid, req_op, cache_tags, cache_valids,
                         req_go, ra)
        #self.dcache_log(m, r1, valid_ra, tlb_hit, self.stall_out, req_op)

        return m


if __name__ == '__main__':
    dut = DCache()
    vl = rtlil.convert(dut, ports=[])
    with open("test_dcache.il", "w") as f:
        f.write(vl)
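
# a minimal simulation sketch (not run here; assumes the nmutil Simulator
# wrapper imported above behaves like nmigen's pysim Simulator):
#
#   m = Module()
#   m.submodules.dcache = dut = DCache()
#   sim = Simulator(m)
#   sim.add_clock(1e-6)
#   def process():
#       yield dut.d_in.valid.eq(0)  # idle request input
#       yield
#   sim.add_sync_process(process)
#   with sim.write_vcd('test_dcache.vcd'):
#       sim.run()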