#!/usr/bin/env python3
#
# Copyright (C) 2020,2021 Luke Kenneth Casson Leighton <lkcl@lkcl.net>
# Copyright (C) 2020 Cole Poirier
# Copyright (C) 2020,2021 Cesar Strauss
# Copyright (C) 2021 Tobias Platen
#
# Original dcache.vhdl Copyright of its authors and licensed
# by IBM under CC-BY 4.0
# https://github.com/antonblanchard/microwatt
#
# Conversion to nmigen funded by NLnet and NGI POINTER under EU Grants
# 871528 and 957073, under the LGPL-v3+ License

"""DCache

based on Anton Blanchard microwatt dcache.vhdl

note that the microwatt dcache wishbone interface expects "stall".
for simplicity at the moment this is hard-coded to cyc & ~ack.
see WB4 spec, p84, section 5.2.1

IMPORTANT: for store, the data is sampled the cycle AFTER the "valid"
is raised. sigh

Links:

* https://libre-soc.org/3d_gpu/architecture/set_associative_cache.jpg
* https://bugs.libre-soc.org/show_bug.cgi?id=469
* https://libre-soc.org/irclog-microwatt/%23microwatt.2021-12-07.log.html
  (discussion about brams for ECP5)

"""

import sys

from nmutil.gtkw import write_gtkw

sys.setrecursionlimit(1000000)

from enum import Enum, unique

from nmigen import (Module, Signal, Elaboratable, Cat, Repl, Array, Const,
                    Record, Memory)
from nmutil.util import Display
from nmigen.lib.coding import Decoder

from copy import deepcopy
from random import randint, seed

from nmigen_soc.wishbone.bus import Interface

from nmigen.cli import main
from nmutil.iocontrol import RecordObject
from nmigen.utils import log2_int
from soc.experiment.mem_types import (LoadStore1ToDCacheType,
                                      DCacheToLoadStore1Type,
                                      MMUToDCacheType,
                                      DCacheToMMUType)

from soc.experiment.wb_types import (WB_ADDR_BITS, WB_DATA_BITS, WB_SEL_BITS,
                                     WBAddrType, WBDataType, WBSelType,
                                     WBMasterOut, WBSlaveOut,
                                     WBMasterOutVector, WBSlaveOutVector,
                                     WBIOMasterOut, WBIOSlaveOut)

from soc.experiment.cache_ram import CacheRam
from soc.experiment.plru import PLRU, PLRUs
#from nmutil.plru import PLRU, PLRUs

# for test
from soc.bus.sram import SRAM
from nmigen import Memory
from nmigen.cli import rtlil

# NOTE: to use cxxsim, export NMIGEN_SIM_MODE=cxxsim from the shell
# Also, check out the cxxsim nmigen branch, and latest yosys from git
from nmutil.sim_tmp_alternative import Simulator

from nmutil.util import wrap


# TODO: make these parameters of DCache at some point
LINE_SIZE = 64    # Line size in bytes
NUM_LINES = 64    # Number of lines in a set
NUM_WAYS = 2      # Number of ways
TLB_SET_SIZE = 64 # L1 DTLB entries per set
TLB_NUM_WAYS = 2  # L1 DTLB number of ways
TLB_LG_PGSZ = 12  # L1 DTLB log_2(page_size)
LOG_LENGTH = 0    # Non-zero to enable log data collection

# BRAM organisation: We never access more than
# WB_DATA_BITS at a time so, to save
# resources, we make the array only that wide, and
# use consecutive indices to make a cache "line"
#
# ROW_SIZE is the width in bytes of the BRAM
# (based on WB, so 64-bits)
ROW_SIZE = WB_DATA_BITS // 8

# ROW_PER_LINE is the number of rows (wishbone
# transactions) in a line
ROW_PER_LINE = LINE_SIZE // ROW_SIZE

# BRAM_ROWS is the number of rows in BRAM needed
# to represent the full dcache
BRAM_ROWS = NUM_LINES * ROW_PER_LINE

print ("ROW_SIZE", ROW_SIZE)
print ("ROW_PER_LINE", ROW_PER_LINE)
print ("BRAM_ROWS", BRAM_ROWS)
print ("NUM_WAYS", NUM_WAYS)

# Bit field counts in the address

# REAL_ADDR_BITS is the number of real address
# bits that we store
REAL_ADDR_BITS = 56

# ROW_BITS is the number of bits to select a row
ROW_BITS = log2_int(BRAM_ROWS)

# ROW_LINE_BITS is the number of bits to select
# a row within a line
ROW_LINE_BITS = log2_int(ROW_PER_LINE)

# LINE_OFF_BITS is the number of bits for
# the offset in a cache line
LINE_OFF_BITS = log2_int(LINE_SIZE)

# ROW_OFF_BITS is the number of bits for
# the offset in a row
ROW_OFF_BITS = log2_int(ROW_SIZE)

# INDEX_BITS is the number of bits to
# select a cache line
INDEX_BITS = log2_int(NUM_LINES)

# SET_SIZE_BITS is the log base 2 of the set size
SET_SIZE_BITS = LINE_OFF_BITS + INDEX_BITS

# TAG_BITS is the number of bits of
# the tag part of the address
TAG_BITS = REAL_ADDR_BITS - SET_SIZE_BITS

# TAG_WIDTH is the width in bits of each way of the tag RAM,
# rounded up to the next multiple of 8 bits
TAG_WIDTH = TAG_BITS + 7 - ((TAG_BITS + 7) % 8)
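
# With the defaults, TAG_BITS = 56 - 12 = 44 and TAG_WIDTH = 48:
# each way's tag is padded to a whole number of bytes, so every
# way starts on a byte boundary within the packed tag RAM row
# (matching the granularity=TAG_WIDTH write port used below).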

# WAY_BITS is the number of bits to select a way
WAY_BITS = log2_int(NUM_WAYS)

# Example of layout for 64 lines of 64 bytes:
layout = f"""\
  DCache Layout:
 |.. -----------------------| REAL_ADDR_BITS ({REAL_ADDR_BITS})
  ..         |--------------| SET_SIZE_BITS ({SET_SIZE_BITS})
  ..  tag    |index|  line  |
  ..         |   row   |    |
  ..         |     |---|    | ROW_LINE_BITS ({ROW_LINE_BITS})
  ..         |     |--- - --| LINE_OFF_BITS ({LINE_OFF_BITS})
  ..         |         |- --| ROW_OFF_BITS ({ROW_OFF_BITS})
  ..         |----- ---|    | ROW_BITS ({ROW_BITS})
  ..         |-----|        | INDEX_BITS ({INDEX_BITS})
  .. --------|              | TAG_BITS ({TAG_BITS})
"""
print (layout)
print ("Dcache TAG %d IDX %d ROW_BITS %d ROFF %d LOFF %d RLB %d" % \
            (TAG_BITS, INDEX_BITS, ROW_BITS,
             ROW_OFF_BITS, LINE_OFF_BITS, ROW_LINE_BITS))
print ("index @: %d-%d" % (LINE_OFF_BITS, SET_SIZE_BITS))
print ("row @: %d-%d" % (ROW_OFF_BITS, SET_SIZE_BITS))
print ("tag @: %d-%d width %d" % (SET_SIZE_BITS, REAL_ADDR_BITS, TAG_WIDTH))

TAG_RAM_WIDTH = TAG_WIDTH * NUM_WAYS

print ("TAG_RAM_WIDTH", TAG_RAM_WIDTH)
print (" TAG_WIDTH", TAG_WIDTH)
print (" NUM_WAYS", NUM_WAYS)
print (" NUM_LINES", NUM_LINES)


def CacheTagArray():
    return Array(Signal(TAG_RAM_WIDTH, name="tag%d" % x) \
                 for x in range(NUM_LINES))

def CacheValidsArray():
    return Array(Signal(NUM_WAYS, name="tag_valids%d" % x)
                 for x in range(NUM_LINES))

def RowPerLineValidArray():
    return Array(Signal(name="rows_valid%d" % x) \
                 for x in range(ROW_PER_LINE))


# L1 TLB
TLB_SET_BITS = log2_int(TLB_SET_SIZE)
TLB_WAY_BITS = log2_int(TLB_NUM_WAYS)
TLB_EA_TAG_BITS = 64 - (TLB_LG_PGSZ + TLB_SET_BITS)
TLB_TAG_WAY_BITS = TLB_NUM_WAYS * TLB_EA_TAG_BITS
TLB_PTE_BITS = 64
TLB_PTE_WAY_BITS = TLB_NUM_WAYS * TLB_PTE_BITS
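
# With the defaults: TLB_SET_BITS = 6, so the effective-address tag is
# TLB_EA_TAG_BITS = 64 - (12 + 6) = 46 bits, and each TLB set row packs
# TLB_NUM_WAYS=2 ways: 92 tag bits and 128 PTE bits per row.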

def ispow2(x):
    return (1<<log2_int(x, False)) == x

assert (LINE_SIZE % ROW_SIZE) == 0, "LINE_SIZE not multiple of ROW_SIZE"
assert ispow2(LINE_SIZE), "LINE_SIZE not power of 2"
assert ispow2(NUM_LINES), "NUM_LINES not power of 2"
assert ispow2(ROW_PER_LINE), "ROW_PER_LINE not power of 2"
assert ROW_BITS == (INDEX_BITS + ROW_LINE_BITS), "geometry bits don't add up"
assert (LINE_OFF_BITS == ROW_OFF_BITS + ROW_LINE_BITS), \
        "geometry bits don't add up"
assert REAL_ADDR_BITS == (TAG_BITS + INDEX_BITS + LINE_OFF_BITS), \
        "geometry bits don't add up"
assert REAL_ADDR_BITS == (TAG_BITS + ROW_BITS + ROW_OFF_BITS), \
        "geometry bits don't add up"
assert 64 == WB_DATA_BITS, "Can't yet handle wb width that isn't 64-bits"
assert SET_SIZE_BITS <= TLB_LG_PGSZ, "Set indexed by virtual address"
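
# The last assert keeps all index bits within the page offset: the
# virtual and real addresses agree on those bits, so the cache can be
# indexed with the virtual address while the TLB lookup proceeds in
# parallel (the tag compare then uses the real address).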


def TLBHit(name):
    return Record([('valid', 1),
                   ('way', TLB_WAY_BITS)], name=name)

def TLBTagEAArray():
    return Array(Signal(TLB_EA_TAG_BITS, name="tlbtagea%d" % x) \
                 for x in range (TLB_NUM_WAYS))

def TLBRecord(name):
    tlb_layout = [('valid', TLB_NUM_WAYS),
                  ('tag', TLB_TAG_WAY_BITS),
                  ('pte', TLB_PTE_WAY_BITS)
                 ]
    return Record(tlb_layout, name=name)

def TLBValidArray():
    return Array(Signal(TLB_NUM_WAYS, name="tlb_valid%d" % x)
                 for x in range(TLB_SET_SIZE))

def HitWaySet():
    return Array(Signal(WAY_BITS, name="hitway_%d" % x) \
                 for x in range(TLB_NUM_WAYS))

# Cache RAM interface
def CacheRamOut():
    return Array(Signal(WB_DATA_BITS, name="cache_out%d" % x) \
                 for x in range(NUM_WAYS))

# PLRU output interface
def PLRUOut():
    return Array(Signal(WAY_BITS, name="plru_out%d" % x) \
                 for x in range(NUM_LINES))

# TLB PLRU output interface
def TLBPLRUOut():
    return Array(Signal(TLB_WAY_BITS, name="tlbplru_out%d" % x) \
                 for x in range(TLB_SET_SIZE))

# Helper functions to decode incoming requests
#
# Return the cache line index (tag index) for an address
def get_index(addr):
    return addr[LINE_OFF_BITS:SET_SIZE_BITS]

# Return the cache row index (data memory) for an address
def get_row(addr):
    return addr[ROW_OFF_BITS:SET_SIZE_BITS]

# Return the index of a row within a line
def get_row_of_line(row):
    return row[:ROW_BITS][:ROW_LINE_BITS]

# Returns whether this is the last row of a line
def is_last_row_addr(addr, last):
    return addr[ROW_OFF_BITS:LINE_OFF_BITS] == last

# Returns whether this is the last row of a line
def is_last_row(row, last):
    return get_row_of_line(row) == last

# Return the next row in the current cache line. We use a
# dedicated function in order to limit the size of the
# generated adder to be only the bits within a cache line
# (3 bits with default settings)
def next_row(row):
    row_v = row[0:ROW_LINE_BITS] + 1
    return Cat(row_v[:ROW_LINE_BITS], row[ROW_LINE_BITS:])
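
# note: only the low ROW_LINE_BITS are incremented, so the row number
# wraps within the line: e.g. row 0b...111 steps to 0b...000 of the
# same line rather than carrying into the index bits.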

# Get the tag value from the address
def get_tag(addr):
    return addr[SET_SIZE_BITS:REAL_ADDR_BITS]
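
# For the default geometry a 56-bit real address therefore splits as:
#   addr[0:3]   byte within a row   (ROW_OFF_BITS)
#   addr[3:6]   row within a line   (ROW_LINE_BITS)
#   addr[6:12]  line index          (INDEX_BITS)
#   addr[12:56] tag                 (TAG_BITS)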

# Read a tag from a tag memory row
def read_tag(way, tagset):
    return tagset.word_select(way, TAG_WIDTH)[:TAG_BITS]

# Read a TLB tag from a TLB tag memory row
def read_tlb_tag(way, tags):
    return tags.word_select(way, TLB_EA_TAG_BITS)

# Write a TLB tag to a TLB tag memory row
def write_tlb_tag(way, tags, tag):
    return read_tlb_tag(way, tags).eq(tag)

# Read a PTE from a TLB PTE memory row
def read_tlb_pte(way, ptes):
    return ptes.word_select(way, TLB_PTE_BITS)

# Write a PTE to a TLB PTE memory row
def write_tlb_pte(way, ptes, newpte):
    return read_tlb_pte(way, ptes).eq(newpte)
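
# These write helpers return an assignment, e.g.:
#   comb += write_tlb_tag(way, tags, eatag)
# which updates only that way's slice of the packed row signal.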


# Record for storing permission, attribute, etc. bits from a PTE
class PermAttr(RecordObject):
    def __init__(self, name=None):
        super().__init__(name=name)
        self.reference = Signal()
        self.changed = Signal()
        self.nocache = Signal()
        self.priv = Signal()
        self.rd_perm = Signal()
        self.wr_perm = Signal()


def extract_perm_attr(pte):
    pa = PermAttr()
    return pa


# Type of operation on a "valid" input
@unique
class Op(Enum):
    OP_NONE = 0
    OP_BAD = 1           # NC cache hit, TLB miss, prot/RC failure
    OP_STCX_FAIL = 2     # conditional store w/o reservation
    OP_LOAD_HIT = 3      # Cache hit on load
    OP_LOAD_MISS = 4     # Load missing cache
    OP_LOAD_NC = 5       # Non-cachable load
    OP_STORE_HIT = 6     # Store hitting cache
    OP_STORE_MISS = 7    # Store missing cache


# Cache state machine
@unique
class State(Enum):
    IDLE = 0             # Normal load hit processing
    RELOAD_WAIT_ACK = 1  # Cache reload wait ack
    STORE_WAIT_ACK = 2   # Store wait ack
    NC_LOAD_WAIT_ACK = 3 # Non-cachable load wait ack


# Dcache operations:
#
# In order to make timing, we use the BRAMs with
# an output buffer, which means that the BRAM
# output is delayed by an extra cycle.
#
# Thus, the dcache has a 2-stage internal pipeline
# for cache hits with no stalls.
#
# All other operations are handled via stalling
# in the first stage.
#
# The second stage can thus complete a hit at the same
# time as the first stage emits a stall for a complex op.
#
# Stage 0 register, basically contains just the latched request

class RegStage0(RecordObject):
    def __init__(self, name=None):
        super().__init__(name=name)
        self.req = LoadStore1ToDCacheType(name="lsmem")
        self.tlbie = Signal()   # indicates a tlbie request (from MMU)
        self.doall = Signal()   # with tlbie, indicates flush whole TLB
        self.tlbld = Signal()   # indicates a TLB load request (from MMU)
        self.mmu_req = Signal() # indicates source of request
        self.d_valid = Signal() # indicates req.data is valid now


class MemAccessRequest(RecordObject):
    def __init__(self, name=None):
        super().__init__(name=name)
        self.op = Signal(Op)
        self.valid = Signal()
        self.dcbz = Signal()
        self.real_addr = Signal(REAL_ADDR_BITS)
        self.data = Signal(64)
        self.byte_sel = Signal(8)
        self.hit_way = Signal(WAY_BITS)
        self.same_tag = Signal()
        self.mmu_req = Signal()


# First stage register, contains state for stage 1 of load hits
# and for the state machine used by all other operations
class RegStage1(RecordObject):
    def __init__(self, name=None):
        super().__init__(name=name)
        # Info about the request
        self.full = Signal()    # have uncompleted request
        self.mmu_req = Signal() # request is from MMU
        self.req = MemAccessRequest(name="reqmem")

        # Cache hit state
        self.hit_way = Signal(WAY_BITS)
        self.hit_load_valid = Signal()
        self.hit_index = Signal(INDEX_BITS)
        self.cache_hit = Signal()

        # TLB hit state
        self.tlb_hit = TLBHit("tlb_hit")
        self.tlb_hit_index = Signal(TLB_SET_BITS)

        # 2-stage data buffer for data forwarded from writes to reads
        self.forward_data1 = Signal(64)
        self.forward_data2 = Signal(64)
        self.forward_sel1 = Signal(8)
        self.forward_valid1 = Signal()
        self.forward_way1 = Signal(WAY_BITS)
        self.forward_row1 = Signal(ROW_BITS)
        self.use_forward1 = Signal()
        self.forward_sel = Signal(8)

        # Cache miss state (reload state machine)
        self.state = Signal(State)
        self.dcbz = Signal()
        self.write_bram = Signal()
        self.write_tag = Signal()
        self.slow_valid = Signal()
        self.wb = WBMasterOut("wb")
        self.reload_tag = Signal(TAG_BITS)
        self.store_way = Signal(WAY_BITS)
        self.store_row = Signal(ROW_BITS)
        self.store_index = Signal(INDEX_BITS)
        self.end_row_ix = Signal(ROW_LINE_BITS)
        self.rows_valid = RowPerLineValidArray()
        self.acks_pending = Signal(3)
        self.inc_acks = Signal()
        self.dec_acks = Signal()

        # Signals to complete (possibly with error)
        self.ls_valid = Signal()
        self.ls_error = Signal()
        self.mmu_done = Signal()
        self.mmu_error = Signal()
        self.cache_paradox = Signal()

        # Signal to complete a failed stcx.
        self.stcx_fail = Signal()


# Reservation information
class Reservation(RecordObject):
    def __init__(self, name=None):
        super().__init__(name=name)
        self.valid = Signal()
        self.addr = Signal(64-LINE_OFF_BITS)
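        # note: only addr[LINE_OFF_BITS:64] is kept, i.e. reservations
        # are tracked (and stcx matched) at cache-line granularity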


class DTLBUpdate(Elaboratable):
    def __init__(self):
        self.tlbie = Signal()
        self.tlbwe = Signal()
        self.doall = Signal()
        self.tlb_hit = TLBHit("tlb_hit")
        self.tlb_req_index = Signal(TLB_SET_BITS)

        self.repl_way = Signal(TLB_WAY_BITS)
        self.eatag = Signal(TLB_EA_TAG_BITS)
        self.pte_data = Signal(TLB_PTE_BITS)

        # read from dtlb array
        self.tlb_read = Signal()
        self.tlb_read_index = Signal(TLB_SET_BITS)
        self.tlb_way = TLBRecord("o_tlb_way")

    def elaborate(self, platform):
        m = Module()
        comb = m.d.comb
        sync = m.d.sync

        # there are 3 parts to this:
        # QTY TLB_NUM_WAYs TAGs - of width (say) 46 bits of Effective Address
        # QTY TLB_NUM_WAYs PTEs - of width (say) 64 bits
        # "Valid" bits, one per "way", of QTY TLB_NUM_WAYs.  these cannot
        # be a Memory because they can all be cleared (tlbie, doall), i mean,
        # we _could_, in theory, by overriding the Reset Signal of the Memory,
        # hmmm....

        dtlb_valid = TLBValidArray()
        tlb_req_index = self.tlb_req_index

        print ("TLB_TAG_WAY_BITS", TLB_TAG_WAY_BITS)
        print (" TLB_EA_TAG_BITS", TLB_EA_TAG_BITS)
        print ("    TLB_NUM_WAYS", TLB_NUM_WAYS)
        print ("TLB_PTE_WAY_BITS", TLB_PTE_WAY_BITS)
        print ("    TLB_PTE_BITS", TLB_PTE_BITS)
        print ("    TLB_NUM_WAYS", TLB_NUM_WAYS)

        # TAG and PTE Memory SRAMs. transparent, write-enables are TLB_NUM_WAYS
        tagway = Memory(depth=TLB_SET_SIZE, width=TLB_TAG_WAY_BITS)
        m.submodules.rd_tagway = rd_tagway = tagway.read_port()
        m.submodules.wr_tagway = wr_tagway = tagway.write_port(
                                    granularity=TLB_EA_TAG_BITS)

        pteway = Memory(depth=TLB_SET_SIZE, width=TLB_PTE_WAY_BITS)
        m.submodules.rd_pteway = rd_pteway = pteway.read_port()
        m.submodules.wr_pteway = wr_pteway = pteway.write_port(
                                    granularity=TLB_PTE_BITS)

        # commented out for now, can be put in if Memory.reset can be
        # used for tlbie&doall to reset the entire Memory to zero in 1 cycle
        #validm = Memory(depth=TLB_SET_SIZE, width=TLB_NUM_WAYS)
        #m.submodules.rd_valid = rd_valid = validm.read_port()
        #m.submodules.wr_valid = wr_valid = validm.write_port(
        #granularity=1)

        # connect up read and write addresses to Valid/PTE/TAG SRAMs
        m.d.comb += rd_pteway.addr.eq(self.tlb_read_index)
        m.d.comb += rd_tagway.addr.eq(self.tlb_read_index)
        #m.d.comb += rd_valid.addr.eq(self.tlb_read_index)
        m.d.comb += wr_tagway.addr.eq(tlb_req_index)
        m.d.comb += wr_pteway.addr.eq(tlb_req_index)
        #m.d.comb += wr_valid.addr.eq(tlb_req_index)

        updated = Signal()
        v_updated = Signal()
        tb_out = Signal(TLB_TAG_WAY_BITS) # tlb_way_tags_t
        db_out = Signal(TLB_NUM_WAYS)     # tlb_way_valids_t
        pb_out = Signal(TLB_PTE_WAY_BITS) # tlb_way_ptes_t
        dv = Signal(TLB_NUM_WAYS)         # tlb_way_valids_t

        comb += dv.eq(dtlb_valid[tlb_req_index])
        comb += db_out.eq(dv)

        with m.If(self.tlbie & self.doall):
            # clear all valid bits at once
            # XXX hmmm, validm _could_ use Memory reset here...
            for i in range(TLB_SET_SIZE):
                sync += dtlb_valid[i].eq(0)
        with m.Elif(self.tlbie):
            # invalidate just the hit_way
            with m.If(self.tlb_hit.valid):
                comb += db_out.bit_select(self.tlb_hit.way, 1).eq(0)
                comb += v_updated.eq(1)
        with m.Elif(self.tlbwe):
            # write to the requested tag and PTE
            comb += write_tlb_tag(self.repl_way, tb_out, self.eatag)
            comb += write_tlb_pte(self.repl_way, pb_out, self.pte_data)
            # set valid bit
            comb += db_out.bit_select(self.repl_way, 1).eq(1)

            comb += updated.eq(1)
            comb += v_updated.eq(1)

        # above, sometimes valid is requested to be updated but data not
        # therefore split them out, here.  note the granularity thing matches
        # with the shift-up of the eatag/pte_data into the correct TLB way.
        # thus it is not necessary to write the entire lot, just the portion
        # being altered: hence writing the *old* copy of the row is not needed
        with m.If(updated): # PTE and TAG to be written
            comb += wr_pteway.data.eq(pb_out)
            comb += wr_pteway.en.eq(1<<self.repl_way)
            comb += wr_tagway.data.eq(tb_out)
            comb += wr_tagway.en.eq(1<<self.repl_way)
        with m.If(v_updated): # Valid to be written
            sync += dtlb_valid[tlb_req_index].eq(db_out)
            #comb += wr_valid.data.eq(db_out)
            #comb += wr_valid.en.eq(1<<self.repl_way)

        # select one TLB way, use a register here
        r_delay = Signal()
        sync += r_delay.eq(self.tlb_read)
        # first deal with the valids, which are not in a Memory.
        # tlb way valid is output on a 1 clock delay with sync,
        # but have to explicitly deal with "forwarding" here
        with m.If(self.tlb_read):
            with m.If(v_updated): # write *and* read in same cycle: forward
                sync += self.tlb_way.valid.eq(db_out)
            with m.Else():
                sync += self.tlb_way.valid.eq(dtlb_valid[self.tlb_read_index])
        # now deal with the Memory-read case. the output must remain
        # valid (stable) even when a read-request is not made, but stable
        # on a one-clock delay, hence the register
        r_tlb_way = TLBRecord("r_tlb_way")
        with m.If(r_delay):
            # on one clock delay, capture the contents of the read port(s)
            comb += self.tlb_way.tag.eq(rd_tagway.data)
            comb += self.tlb_way.pte.eq(rd_pteway.data)
            sync += r_tlb_way.tag.eq(rd_tagway.data)
            sync += r_tlb_way.pte.eq(rd_pteway.data)
        with m.Else():
            # ... so that the register can output it when no read is requested
            # it's rather overkill but better to be safe than sorry
            comb += self.tlb_way.tag.eq(r_tlb_way.tag)
            comb += self.tlb_way.pte.eq(r_tlb_way.pte)
            #comb += self.tlb_way.eq(r_tlb_way)

        return m


class DCachePendingHit(Elaboratable):

    def __init__(self, tlb_way,
                 cache_i_validdx, cache_tag_set,
                 req_addr):

        self.go = Signal()
        self.virt_mode = Signal()
        self.is_hit = Signal()
        self.tlb_hit = TLBHit("tlb_hit")
        self.hit_way = Signal(WAY_BITS)
        self.rel_match = Signal()
        self.req_index = Signal(INDEX_BITS)
        self.reload_tag = Signal(TAG_BITS)

        self.tlb_way = tlb_way
        self.cache_i_validdx = cache_i_validdx
        self.cache_tag_set = cache_tag_set
        self.req_addr = req_addr

    def elaborate(self, platform):
        m = Module()
        comb = m.d.comb
        sync = m.d.sync

        go = self.go
        virt_mode = self.virt_mode
        is_hit = self.is_hit
        tlb_way = self.tlb_way
        cache_i_validdx = self.cache_i_validdx
        cache_tag_set = self.cache_tag_set
        req_addr = self.req_addr
        tlb_hit = self.tlb_hit
        hit_way = self.hit_way
        rel_match = self.rel_match
        req_index = self.req_index
        reload_tag = self.reload_tag

        hit_set = Array(Signal(name="hit_set_%d" % i) \
                        for i in range(TLB_NUM_WAYS))
        rel_matches = Array(Signal(name="rel_matches_%d" % i) \
                            for i in range(TLB_NUM_WAYS))
        hit_way_set = HitWaySet()

        # Test if pending request is a hit on any way
        # In order to make timing in virtual mode,
        # when we are using the TLB, we compare each
        # way with each of the real addresses from each way of
        # the TLB, and then decide later which match to use.

        with m.If(virt_mode):
            for j in range(TLB_NUM_WAYS): # tlb_num_way_t
                s_tag = Signal(TAG_BITS, name="s_tag%d" % j)
                s_hit = Signal(name="s_hit%d" % j)
                s_pte = Signal(TLB_PTE_BITS, name="s_pte%d" % j)
                s_ra = Signal(REAL_ADDR_BITS, name="s_ra%d" % j)
                # read the PTE, calc the Real Address, get the tag
                comb += s_pte.eq(read_tlb_pte(j, tlb_way.pte))
                comb += s_ra.eq(Cat(req_addr[0:TLB_LG_PGSZ],
                                    s_pte[TLB_LG_PGSZ:REAL_ADDR_BITS]))
                comb += s_tag.eq(get_tag(s_ra))
                # for each way check the tag against the cache tag set
                for i in range(NUM_WAYS): # way_t
                    is_tag_hit = Signal(name="is_tag_hit_%d_%d" % (j, i))
                    comb += is_tag_hit.eq(go & cache_i_validdx[i] &
                                  (read_tag(i, cache_tag_set) == s_tag)
                                  & (tlb_way.valid[j]))
                    with m.If(is_tag_hit):
                        comb += hit_way_set[j].eq(i)
                        comb += s_hit.eq(1)
                comb += hit_set[j].eq(s_hit)
                comb += rel_matches[j].eq(s_tag == reload_tag)
            with m.If(tlb_hit.valid):
                comb += is_hit.eq(hit_set[tlb_hit.way])
                comb += hit_way.eq(hit_way_set[tlb_hit.way])
                comb += rel_match.eq(rel_matches[tlb_hit.way])
        with m.Else():
            s_tag = Signal(TAG_BITS)
            comb += s_tag.eq(get_tag(req_addr))
            for i in range(NUM_WAYS): # way_t
                is_tag_hit = Signal(name="is_tag_hit_%d" % i)
                comb += is_tag_hit.eq(go & cache_i_validdx[i] &
                                  (read_tag(i, cache_tag_set) == s_tag))
                with m.If(is_tag_hit):
                    comb += hit_way.eq(i)
                    comb += is_hit.eq(1)
            with m.If(s_tag == reload_tag):
                comb += rel_match.eq(1)

        return m


class DCache(Elaboratable):
    """Set associative dcache write-through

    TODO (in no specific order):
    * See list in icache.vhdl
    * Complete load misses on the cycle when WB data comes instead of
      at the end of line (this requires dealing with requests coming in
      while not idle...)
    """
    def __init__(self, pspec=None):
        self.d_in = LoadStore1ToDCacheType("d_in")
        self.d_out = DCacheToLoadStore1Type("d_out")

        self.m_in = MMUToDCacheType("m_in")
        self.m_out = DCacheToMMUType("m_out")

        self.stall_out = Signal()
        self.any_stall_out = Signal()
        self.dreq_when_stall = Signal()
        self.mreq_when_stall = Signal()

        # standard naming (wired to non-standard for compatibility)
        self.bus = Interface(addr_width=32,
                             data_width=64,
                             granularity=8,
                             features={'stall'},
                             alignment=0,
                             name="dcache")

        self.log_out = Signal(20)

        # test if microwatt compatibility is to be enabled
        self.microwatt_compat = (hasattr(pspec, "microwatt_compat") and
                                 (pspec.microwatt_compat == True))

    def stage_0(self, m, r0, r1, r0_full):
        """Latch the request in r0.req as long as we're not stalling
        """
        comb = m.d.comb
        sync = m.d.sync
        d_in, d_out, m_in = self.d_in, self.d_out, self.m_in

        r = RegStage0("stage0")

        # TODO, this goes in unit tests and formal proofs
        with m.If(d_in.valid & m_in.valid):
            sync += Display("request collision loadstore vs MMU")

        with m.If(m_in.valid):
            comb += r.req.valid.eq(1)
            comb += r.req.load.eq(~(m_in.tlbie | m_in.tlbld)) # no invalidate
            comb += r.req.dcbz.eq(0)
            comb += r.req.nc.eq(0)
            comb += r.req.reserve.eq(0)
            comb += r.req.virt_mode.eq(0)
            comb += r.req.priv_mode.eq(1)
            comb += r.req.addr.eq(m_in.addr)
            comb += r.req.data.eq(m_in.pte)
            comb += r.req.byte_sel.eq(~0) # Const -1 sets all to 0b111....
            comb += r.tlbie.eq(m_in.tlbie)
            comb += r.doall.eq(m_in.doall)
            comb += r.tlbld.eq(m_in.tlbld)
            comb += r.mmu_req.eq(1)
            comb += r.d_valid.eq(1)
            m.d.sync += Display(" DCACHE req mmu addr %x pte %x ld %d",
                                m_in.addr, m_in.pte, r.req.load)

        with m.Else():
            comb += r.req.eq(d_in)
            comb += r.req.data.eq(0)
            comb += r.tlbie.eq(0)
            comb += r.doall.eq(0)
            comb += r.tlbld.eq(0)
            comb += r.mmu_req.eq(0)
            comb += r.d_valid.eq(0)

        sync += r0_full.eq(0)
        with m.If((~r1.full & ~d_in.hold) | ~r0_full):
            sync += r0.eq(r)
            sync += r0_full.eq(r.req.valid)
        with m.Elif(~r0.d_valid):
            # Sample data the cycle after a request comes in from loadstore1.
            # If another request has come in already then the data will get
            # put directly into req.data below.
            sync += r0.req.data.eq(d_in.data)
            sync += r0.d_valid.eq(1)
        with m.If(d_in.valid):
            m.d.sync += Display(" DCACHE req cache "
                                "virt %d addr %x data %x ld %d",
                                r.req.virt_mode, r.req.addr,
                                r.req.data, r.req.load)

    def tlb_read(self, m, r0_stall, tlb_way):
        """TLB
        Operates in the second cycle on the request latched in r0.req.
        TLB updates write the entry at the end of the second cycle.
        """
        comb = m.d.comb
        sync = m.d.sync
        m_in, d_in = self.m_in, self.d_in

        addrbits = Signal(TLB_SET_BITS)

        amin = TLB_LG_PGSZ
        amax = TLB_LG_PGSZ + TLB_SET_BITS

        with m.If(m_in.valid):
            comb += addrbits.eq(m_in.addr[amin : amax])
        with m.Else():
            comb += addrbits.eq(d_in.addr[amin : amax])

        # If we have any op and the previous op isn't finished,
        # then keep the same output for next cycle.
        d = self.dtlb_update
        comb += d.tlb_read_index.eq(addrbits)
        comb += d.tlb_read.eq(~r0_stall)
        comb += tlb_way.eq(d.tlb_way)

    def maybe_tlb_plrus(self, m, r1, tlb_plru_victim, tlb_req_index):
        """Generate TLB PLRUs
        """
        comb = m.d.comb
        sync = m.d.sync

        if TLB_NUM_WAYS == 0:
            return

        # suite of PLRUs with a selection and output mechanism
        tlb_plrus = PLRUs(TLB_SET_SIZE, TLB_WAY_BITS)
        m.submodules.tlb_plrus = tlb_plrus
        comb += tlb_plrus.way.eq(r1.tlb_hit.way)
        comb += tlb_plrus.valid.eq(r1.tlb_hit.valid)
        comb += tlb_plrus.index.eq(r1.tlb_hit_index)
        comb += tlb_plrus.isel.eq(tlb_req_index) # select victim
        comb += tlb_plru_victim.eq(tlb_plrus.o_index) # selected victim

    def tlb_search(self, m, tlb_req_index, r0, r0_valid,
                   tlb_way,
                   pte, tlb_hit, valid_ra, perm_attr, ra):

        comb = m.d.comb

        hitway = Signal(TLB_WAY_BITS)
        hit = Signal()
        eatag = Signal(TLB_EA_TAG_BITS)

        TLB_LG_END = TLB_LG_PGSZ + TLB_SET_BITS
        comb += tlb_req_index.eq(r0.req.addr[TLB_LG_PGSZ : TLB_LG_END])
        comb += eatag.eq(r0.req.addr[TLB_LG_END : 64 ])

        for i in range(TLB_NUM_WAYS):
            is_tag_hit = Signal(name="is_tag_hit%d" % i)
            tlb_tag = Signal(TLB_EA_TAG_BITS, name="tlb_tag%d" % i)
            comb += tlb_tag.eq(read_tlb_tag(i, tlb_way.tag))
            comb += is_tag_hit.eq((tlb_way.valid[i]) & (tlb_tag == eatag))
            with m.If(is_tag_hit):
                comb += hitway.eq(i)
                comb += hit.eq(1)

        comb += tlb_hit.valid.eq(hit & r0_valid)
        comb += tlb_hit.way.eq(hitway)

        with m.If(tlb_hit.valid):
            comb += pte.eq(read_tlb_pte(hitway, tlb_way.pte))
        comb += valid_ra.eq(tlb_hit.valid | ~r0.req.virt_mode)

        with m.If(r0.req.virt_mode):
            comb += ra.eq(Cat(Const(0, ROW_OFF_BITS),
                              r0.req.addr[ROW_OFF_BITS:TLB_LG_PGSZ],
                              pte[TLB_LG_PGSZ:REAL_ADDR_BITS]))
            comb += perm_attr.reference.eq(pte[8])
            comb += perm_attr.changed.eq(pte[7])
            comb += perm_attr.nocache.eq(pte[5])
            comb += perm_attr.priv.eq(pte[3])
            comb += perm_attr.rd_perm.eq(pte[2])
            comb += perm_attr.wr_perm.eq(pte[1])
        with m.Else():
            comb += ra.eq(Cat(Const(0, ROW_OFF_BITS),
                              r0.req.addr[ROW_OFF_BITS:REAL_ADDR_BITS]))
            comb += perm_attr.reference.eq(1)
            comb += perm_attr.changed.eq(1)
            comb += perm_attr.nocache.eq(0)
            comb += perm_attr.priv.eq(1)
            comb += perm_attr.rd_perm.eq(1)
            comb += perm_attr.wr_perm.eq(1)

        with m.If(valid_ra):
            m.d.sync += Display("DCACHE virt mode %d hit %d ra %x pte %x",
                                r0.req.virt_mode, tlb_hit.valid, ra, pte)
            m.d.sync += Display(" perm ref=%d", perm_attr.reference)
            m.d.sync += Display(" perm chg=%d", perm_attr.changed)
            m.d.sync += Display(" perm noc=%d", perm_attr.nocache)
            m.d.sync += Display(" perm prv=%d", perm_attr.priv)
            m.d.sync += Display(" perm rdp=%d", perm_attr.rd_perm)
            m.d.sync += Display(" perm wrp=%d", perm_attr.wr_perm)

    def tlb_update(self, m, r0_valid, r0, tlb_req_index,
                   tlb_hit, tlb_plru_victim):

        comb = m.d.comb
        sync = m.d.sync

        tlbie = Signal()
        tlbwe = Signal()

        comb += tlbie.eq(r0_valid & r0.tlbie)
        comb += tlbwe.eq(r0_valid & r0.tlbld)

        d = self.dtlb_update

        comb += d.tlbie.eq(tlbie)
        comb += d.tlbwe.eq(tlbwe)
        comb += d.doall.eq(r0.doall)
        comb += d.tlb_hit.eq(tlb_hit)
        comb += d.tlb_req_index.eq(tlb_req_index)

        with m.If(tlb_hit.valid):
            comb += d.repl_way.eq(tlb_hit.way)
        with m.Else():
            comb += d.repl_way.eq(tlb_plru_victim)
        comb += d.eatag.eq(r0.req.addr[TLB_LG_PGSZ + TLB_SET_BITS:64])
        comb += d.pte_data.eq(r0.req.data)

    def maybe_plrus(self, m, r1, plru_victim):
        """Generate PLRUs
        """
        comb = m.d.comb
        sync = m.d.sync

        if NUM_WAYS == 0:
            return

        # suite of PLRUs with a selection and output mechanism
        m.submodules.plrus = plrus = PLRUs(NUM_LINES, WAY_BITS)
        comb += plrus.way.eq(r1.hit_way)
        comb += plrus.valid.eq(r1.cache_hit)
        comb += plrus.index.eq(r1.hit_index)
        comb += plrus.isel.eq(r1.store_index) # select victim
        comb += plru_victim.eq(plrus.o_index) # selected victim

    def cache_tag_read(self, m, r0_stall, req_index, cache_tag_set):
        """Cache tag RAM read port
        """
        comb = m.d.comb
        sync = m.d.sync

        m_in, d_in = self.m_in, self.d_in

        # synchronous tag read-port
        m.submodules.rd_tag = rd_tag = self.tagmem.read_port()

        index = Signal(INDEX_BITS)

        with m.If(r0_stall):
            comb += index.eq(req_index)
        with m.Elif(m_in.valid):
            comb += index.eq(get_index(m_in.addr))
        with m.Else():
            comb += index.eq(get_index(d_in.addr))
        comb += rd_tag.addr.eq(index)
        comb += cache_tag_set.eq(rd_tag.data) # read-port is a 1-clock delay

    def dcache_request(self, m, r0, ra, req_index, req_row, req_tag,
                       r0_valid, r1, cache_valids, replace_way,
                       use_forward1_next, use_forward2_next,
                       req_hit_way, plru_victim, rc_ok, perm_attr,
                       valid_ra, perm_ok, access_ok, req_op, req_go,
                       tlb_hit, tlb_way, cache_tag_set,
                       cancel_store, req_same_tag, r0_stall, early_req_row):
        """Cache request parsing and hit detection
        """

        comb = m.d.comb
        m_in, d_in = self.m_in, self.d_in

        is_hit = Signal()
        hit_way = Signal(WAY_BITS)
        op = Signal(Op)
        opsel = Signal(3)
        go = Signal()
        nc = Signal()
        cache_i_validdx = Signal(NUM_WAYS)

        # Extract line, row and tag from request
        comb += req_index.eq(get_index(r0.req.addr))
        comb += req_row.eq(get_row(r0.req.addr))
        comb += req_tag.eq(get_tag(ra))

        if False: # display on comb is a bit... busy.
            comb += Display("dcache_req addr:%x ra: %x idx: %x tag: %x row: %x",
                            r0.req.addr, ra, req_index, req_tag, req_row)

        comb += go.eq(r0_valid & ~(r0.tlbie | r0.tlbld) & ~r1.ls_error)
        comb += cache_i_validdx.eq(cache_valids[req_index])

        m.submodules.dcache_pend = dc = DCachePendingHit(tlb_way,
                                            cache_i_validdx, cache_tag_set,
                                            r0.req.addr)
        comb += dc.tlb_hit.eq(tlb_hit)
        comb += dc.reload_tag.eq(r1.reload_tag)
        comb += dc.virt_mode.eq(r0.req.virt_mode)
        comb += dc.go.eq(go)
        comb += dc.req_index.eq(req_index)

        comb += is_hit.eq(dc.is_hit)
        comb += hit_way.eq(dc.hit_way)
        comb += req_same_tag.eq(dc.rel_match)

        # See if the request matches the line currently being reloaded
        with m.If((r1.state == State.RELOAD_WAIT_ACK) &
                  (req_index == r1.store_index) & req_same_tag):
            # For a store, consider this a hit even if the row isn't
            # valid since it will be by the time we perform the store.
            # For a load, check the appropriate row valid bit.
            rrow = Signal(ROW_LINE_BITS)
            comb += rrow.eq(req_row)
            valid = r1.rows_valid[rrow]
            comb += is_hit.eq((~r0.req.load) | valid)
            comb += hit_way.eq(replace_way)

        # Whether to use forwarded data for a load or not
        with m.If((get_row(r1.req.real_addr) == req_row) &
                  (r1.req.hit_way == hit_way)):
            # Only need to consider r1.write_bram here, since if we
            # are writing refill data here, then we don't have a
            # cache hit this cycle on the line being refilled.
            # (There is the possibility that the load following the
            # load miss that started the refill could be to the old
            # contents of the victim line, since it is a couple of
            # cycles after the refill starts before we see the updated
            # cache tag. In that case we don't use the bypass.)
            comb += use_forward1_next.eq(r1.write_bram)
        with m.If((r1.forward_row1 == req_row) & (r1.forward_way1 == hit_way)):
            comb += use_forward2_next.eq(r1.forward_valid1)

        # The way that matched on a hit
        comb += req_hit_way.eq(hit_way)

        # The way to replace on a miss
        with m.If(r1.write_tag):
            comb += replace_way.eq(plru_victim)
        with m.Else():
            comb += replace_way.eq(r1.store_way)

        # work out whether we have permission for this access
        # NB we don't yet implement AMR, thus no KUAP
        comb += rc_ok.eq(perm_attr.reference
                         & (r0.req.load | perm_attr.changed))
        comb += perm_ok.eq((r0.req.priv_mode | (~perm_attr.priv)) &
                           (perm_attr.wr_perm |
                            (r0.req.load & perm_attr.rd_perm)))
        comb += access_ok.eq(valid_ra & perm_ok & rc_ok)

        # Combine the request and cache hit status to decide what
        # operation needs to be done
        comb += nc.eq(r0.req.nc | perm_attr.nocache)
        comb += op.eq(Op.OP_NONE)
        with m.If(go):
            with m.If(~access_ok):
                m.d.sync += Display("DCACHE access fail valid_ra=%d p=%d rc=%d",
                                    valid_ra, perm_ok, rc_ok)
                comb += op.eq(Op.OP_BAD)
            with m.Elif(cancel_store):
                m.d.sync += Display("DCACHE cancel store")
                comb += op.eq(Op.OP_STCX_FAIL)
            with m.Else():
                m.d.sync += Display("DCACHE valid_ra=%d nc=%d ld=%d",
                                    valid_ra, nc, r0.req.load)
                comb += opsel.eq(Cat(is_hit, nc, r0.req.load))
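                # opsel bit 0 = is_hit, bit 1 = nc, bit 2 = load: e.g.
                # 0b101 is a cacheable load hit, while 0b011 (a hit on
                # a non-cacheable store) is a paradox, hence OP_BAD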
                with m.Switch(opsel):
                    with m.Case(0b101): comb += op.eq(Op.OP_LOAD_HIT)
                    with m.Case(0b100): comb += op.eq(Op.OP_LOAD_MISS)
                    with m.Case(0b110): comb += op.eq(Op.OP_LOAD_NC)
                    with m.Case(0b001): comb += op.eq(Op.OP_STORE_HIT)
                    with m.Case(0b000): comb += op.eq(Op.OP_STORE_MISS)
                    with m.Case(0b010): comb += op.eq(Op.OP_STORE_MISS)
                    with m.Case(0b011): comb += op.eq(Op.OP_BAD)
                    with m.Case(0b111): comb += op.eq(Op.OP_BAD)
        comb += req_op.eq(op)
        comb += req_go.eq(go)

        # Version of the row number that is valid one cycle earlier
        # in the cases where we need to read the cache data BRAM.
        # If we're stalling then we need to keep reading the last
        # row requested.
        with m.If(~r0_stall):
            with m.If(m_in.valid):
                comb += early_req_row.eq(get_row(m_in.addr))
            with m.Else():
                comb += early_req_row.eq(get_row(d_in.addr))
        with m.Else():
            comb += early_req_row.eq(req_row)

    def reservation_comb(self, m, cancel_store, set_rsrv, clear_rsrv,
                         r0_valid, r0, reservation):
        """Handle load-with-reservation and store-conditional instructions
        """
        comb = m.d.comb

        with m.If(r0_valid & r0.req.reserve):
            # XXX generate alignment interrupt if address
            # is not aligned XXX or if r0.req.nc = '1'
            with m.If(r0.req.load):
                comb += set_rsrv.eq(r0.req.atomic_last) # load with reservation
            with m.Else():
                comb += clear_rsrv.eq(r0.req.atomic_last) # store conditional
                with m.If((~reservation.valid) |
                          (r0.req.addr[LINE_OFF_BITS:64] != reservation.addr)):
                    comb += cancel_store.eq(1)

    def reservation_reg(self, m, r0_valid, access_ok, set_rsrv, clear_rsrv,
                        reservation, r0):
        comb = m.d.comb
        sync = m.d.sync

        with m.If(r0_valid & access_ok):
            with m.If(clear_rsrv):
                sync += reservation.valid.eq(0)
            with m.Elif(set_rsrv):
                sync += reservation.valid.eq(1)
                sync += reservation.addr.eq(r0.req.addr[LINE_OFF_BITS:64])

    def writeback_control(self, m, r1, cache_out_row):
        """Return data for loads & completion control logic
        """
        comb = m.d.comb
        sync = m.d.sync
        d_out, m_out = self.d_out, self.m_out

        data_out = Signal(64)
        data_fwd = Signal(64)

        # Use the bypass if we are reading the row that was
        # written 1 or 2 cycles ago, including for the
        # slow_valid = 1 case (i.e. completing a load
        # miss or a non-cacheable load).
        with m.If(r1.use_forward1):
            comb += data_fwd.eq(r1.forward_data1)
        with m.Else():
            comb += data_fwd.eq(r1.forward_data2)

        comb += data_out.eq(cache_out_row)

        for i in range(8):
            with m.If(r1.forward_sel[i]):
                dsel = data_fwd.word_select(i, 8)
                comb += data_out.word_select(i, 8).eq(dsel)

        # DCache output to LoadStore
        comb += d_out.valid.eq(r1.ls_valid)
        comb += d_out.data.eq(data_out)
        comb += d_out.store_done.eq(~r1.stcx_fail)
        comb += d_out.error.eq(r1.ls_error)
        comb += d_out.cache_paradox.eq(r1.cache_paradox)

        # Outputs to MMU
        comb += m_out.done.eq(r1.mmu_done)
        comb += m_out.err.eq(r1.mmu_error)
        comb += m_out.data.eq(data_out)

        # We have a valid load or store hit or we just completed
        # a slow op such as a load miss, a NC load or a store
        #
        # Note: the load hit is delayed by one cycle. However it
        # can still not collide with r.slow_valid (well unless I
        # miscalculated) because slow_valid can only be set on a
        # subsequent request and not on its first cycle (the state
        # machine must have advanced), which makes slow_valid
        # at least 2 cycles from the previous hit_load_valid.

        # Sanity: Only one of these must be set in any given cycle

        if False: # TODO: need Display to get this to work
            assert (r1.slow_valid & r1.stcx_fail) != 1, \
                "unexpected slow_valid collision with stcx_fail"

            assert ((r1.slow_valid | r1.stcx_fail) | r1.hit_load_valid) != 1, \
                "unexpected hit_load_delayed collision with slow_valid"

        with m.If(~r1.mmu_req):
            # Request came from loadstore1...
            # Load hit case is the standard path
            with m.If(r1.hit_load_valid):
                sync += Display("completing load hit data=%x", data_out)

            # error cases complete without stalling
            with m.If(r1.ls_error):
                with m.If(r1.dcbz):
                    sync += Display("completing dcbz with error")
                with m.Else():
                    sync += Display("completing ld/st with error")

            # Slow ops (load miss, NC, stores)
            with m.If(r1.slow_valid):
                sync += Display("completing store or load miss adr=%x data=%x",
                                r1.req.real_addr, data_out)

        with m.Else():
            # Request came from MMU
            with m.If(r1.hit_load_valid):
                sync += Display("completing load hit to MMU, data=%x",
                                m_out.data)
            # error cases complete without stalling
            with m.If(r1.mmu_error):
                sync += Display("completing MMU ld with error")

            # Slow ops (i.e. load miss)
            with m.If(r1.slow_valid):
                sync += Display("completing MMU load miss, adr=%x data=%x",
                                r1.req.real_addr, m_out.data)

    def rams(self, m, r1, early_req_row, cache_out_row, replace_way):
        """rams
        Generate a cache RAM for each way. This handles the normal
        reads, writes from reloads and the special store-hit update
        path as well.

        Note: the BRAMs have an extra read buffer, meaning the output
        is pipelined an extra cycle. This differs from the
        icache. The writeback logic needs to take that into
        account by using 1-cycle delayed signals for load hits.
        """
        comb = m.d.comb
        bus = self.bus

        # Binary-to-Unary one-hot decoders here. the replace-way one-hot
        # is gated (enabled) by bus.ack, not-write-bram, and state
        # RELOAD_WAIT_ACK
        m.submodules.rams_replace_way_e = rwe = Decoder(NUM_WAYS)
        comb += rwe.n.eq(~((r1.state == State.RELOAD_WAIT_ACK) & bus.ack &
                           ~r1.write_bram))
        comb += rwe.i.eq(replace_way)

        m.submodules.rams_hit_way_e = hwe = Decoder(NUM_WAYS)
        comb += hwe.i.eq(r1.hit_way)

        # this one is gated with write_bram, and replace_way_e can never be
        # set at the same time.  that means that do_write can OR the outputs
        m.submodules.rams_hit_req_way_e = hre = Decoder(NUM_WAYS)
        comb += hre.n.eq(~r1.write_bram) # Decoder.n is inverted
        comb += hre.i.eq(r1.req.hit_way)

        # common Signals
        do_read = Signal()
        wr_addr = Signal(ROW_BITS)
        wr_data = Signal(WB_DATA_BITS)
        wr_sel = Signal(ROW_SIZE)
        rd_addr = Signal(ROW_BITS)

        comb += do_read.eq(1) # always enable
        comb += rd_addr.eq(early_req_row)

        # Write mux:
        #
        # Defaults to wishbone read responses (cache refill)
        #
        # For timing, the mux on wr_data/sel/addr is not
        # dependent on anything other than the current state.

        with m.If(r1.write_bram):
            # Write store data to BRAM.  This happens one
            # cycle after the store is in r0.
            comb += wr_data.eq(r1.req.data)
            comb += wr_sel.eq(r1.req.byte_sel)
            comb += wr_addr.eq(get_row(r1.req.real_addr))

        with m.Else():
            # Otherwise, we might be doing a reload or a DCBZ
            with m.If(r1.dcbz):
                comb += wr_data.eq(0)
            with m.Else():
                comb += wr_data.eq(bus.dat_r)
            comb += wr_addr.eq(r1.store_row)
            comb += wr_sel.eq(~0) # all 1s

        # set up Cache Rams
        for i in range(NUM_WAYS):
            do_write = Signal(name="do_wr%d" % i)
            wr_sel_m = Signal(ROW_SIZE, name="wr_sel_m_%d" % i)
            d_out = Signal(WB_DATA_BITS, name="dout_%d" % i) # cache_row_t

            way = CacheRam(ROW_BITS, WB_DATA_BITS, ADD_BUF=True, ram_num=i)
            m.submodules["cacheram_%d" % i] = way

            comb += way.rd_en.eq(do_read)
            comb += way.rd_addr.eq(rd_addr)
            comb += d_out.eq(way.rd_data_o)
            comb += way.wr_sel.eq(wr_sel_m)
            comb += way.wr_addr.eq(wr_addr)
            comb += way.wr_data.eq(wr_data)

            # Cache hit reads
            with m.If(hwe.o[i]):
                comb += cache_out_row.eq(d_out)

            # these are mutually-exclusive via their Decoder-enablers
            # (note: Decoder-enable is inverted)
            comb += do_write.eq(hre.o[i] | rwe.o[i])

            # Mask write selects with do_write since BRAM
            # doesn't have a global write-enable
            with m.If(do_write):
                comb += wr_sel_m.eq(wr_sel)

    # Cache hit synchronous machine for the easy case.
    # This handles load hits.
    # It also handles error cases (TLB miss, cache paradox)
    def dcache_fast_hit(self, m, req_op, r0_valid, r0, r1,
                        req_hit_way, req_index, req_tag, access_ok,
                        tlb_hit, tlb_req_index):
        comb = m.d.comb
        sync = m.d.sync

        with m.If(req_op != Op.OP_NONE):
            sync += Display("op:%d addr:%x nc: %d idx: %x tag: %x way: %x",
                            req_op, r0.req.addr, r0.req.nc,
                            req_index, req_tag, req_hit_way)

        with m.If(r0_valid):
            sync += r1.mmu_req.eq(r0.mmu_req)

        # Fast path for load/store hits.
        # Set signals for the writeback controls.
        sync += r1.hit_way.eq(req_hit_way)
        sync += r1.hit_index.eq(req_index)

        sync += r1.hit_load_valid.eq(req_op == Op.OP_LOAD_HIT)
        sync += r1.cache_hit.eq((req_op == Op.OP_LOAD_HIT) |
                                (req_op == Op.OP_STORE_HIT))

        with m.If(req_op == Op.OP_BAD):
            sync += Display("Signalling ld/st error "
                            "ls_error=%i mmu_error=%i cache_paradox=%i",
                            ~r0.mmu_req, r0.mmu_req, access_ok)
            sync += r1.ls_error.eq(~r0.mmu_req)
            sync += r1.mmu_error.eq(r0.mmu_req)
            sync += r1.cache_paradox.eq(access_ok)
        with m.Else():
            sync += r1.ls_error.eq(0)
            sync += r1.mmu_error.eq(0)
            sync += r1.cache_paradox.eq(0)

        sync += r1.stcx_fail.eq(req_op == Op.OP_STCX_FAIL)

        # Record TLB hit information for updating TLB PLRU
        sync += r1.tlb_hit.eq(tlb_hit)
        sync += r1.tlb_hit_index.eq(tlb_req_index)

    # Memory accesses are handled by this state machine:
    #
    #   * Cache load miss/reload (in conjunction with "rams")
    #   * Load hits for non-cachable forms
    #   * Stores (the collision case is handled in "rams")
    #
    # All wishbone request generation is done here.
    # This machine operates at stage 1.
    def dcache_slow(self, m, r1, use_forward1_next, use_forward2_next,
                    r0, replace_way,
                    req_hit_way, req_same_tag,
                    r0_valid, req_op, cache_valids, req_go, ra):

        comb = m.d.comb
        sync = m.d.sync
        bus = self.bus
        d_in = self.d_in

        m.submodules.wr_tag = wr_tag = self.tagmem.write_port(
                                            granularity=TAG_WIDTH)

        req = MemAccessRequest("mreq_ds")

        r1_next_cycle = Signal()
        req_row = Signal(ROW_BITS)
        req_idx = Signal(INDEX_BITS)
        req_tag = Signal(TAG_BITS)
        comb += req_idx.eq(get_index(req.real_addr))
        comb += req_row.eq(get_row(req.real_addr))
        comb += req_tag.eq(get_tag(req.real_addr))

        sync += r1.use_forward1.eq(use_forward1_next)
        sync += r1.forward_sel.eq(0)

        with m.If(use_forward1_next):
            sync += r1.forward_sel.eq(r1.req.byte_sel)
        with m.Elif(use_forward2_next):
            sync += r1.forward_sel.eq(r1.forward_sel1)

        sync += r1.forward_data2.eq(r1.forward_data1)
        with m.If(r1.write_bram):
            sync += r1.forward_data1.eq(r1.req.data)
            sync += r1.forward_sel1.eq(r1.req.byte_sel)
            sync += r1.forward_way1.eq(r1.req.hit_way)
            sync += r1.forward_row1.eq(get_row(r1.req.real_addr))
            sync += r1.forward_valid1.eq(1)
        with m.Else():
            with m.If(r1.dcbz):
                sync += r1.forward_data1.eq(0)
            with m.Else():
                sync += r1.forward_data1.eq(bus.dat_r)
            sync += r1.forward_sel1.eq(~0) # all 1s
            sync += r1.forward_way1.eq(replace_way)
            sync += r1.forward_row1.eq(r1.store_row)
            sync += r1.forward_valid1.eq(0)

        # One cycle pulses reset
        sync += r1.slow_valid.eq(0)
        sync += r1.write_bram.eq(0)
        sync += r1.inc_acks.eq(0)
        sync += r1.dec_acks.eq(0)

        sync += r1.ls_valid.eq(0)
        # complete tlbies and TLB loads in the third cycle
        sync += r1.mmu_done.eq(r0_valid & (r0.tlbie | r0.tlbld))

        with m.If((req_op == Op.OP_LOAD_HIT) | (req_op == Op.OP_STCX_FAIL)):
            with m.If(r0.mmu_req):
                sync += r1.mmu_done.eq(1)
            with m.Else():
                sync += r1.ls_valid.eq(1)

        with m.If(r1.write_tag):
            # Store new tag in selected way
            replace_way_onehot = Signal(NUM_WAYS)
            comb += replace_way_onehot.eq(1<<replace_way)
            ct = Signal(TAG_RAM_WIDTH)
            comb += ct.eq(r1.reload_tag << (replace_way*TAG_WIDTH))
            comb += wr_tag.en.eq(replace_way_onehot)
            comb += wr_tag.addr.eq(r1.store_index)
            comb += wr_tag.data.eq(ct)

            sync += r1.store_way.eq(replace_way)
            sync += r1.write_tag.eq(0)

        # Take request from r1.req if there is one there,
        # else from req_op, ra, etc.
        with m.If(r1.full):
            comb += req.eq(r1.req)
        with m.Else():
            comb += req.op.eq(req_op)
            comb += req.valid.eq(req_go)
            comb += req.mmu_req.eq(r0.mmu_req)
            comb += req.dcbz.eq(r0.req.dcbz)
            comb += req.real_addr.eq(ra)

            with m.If(r0.req.dcbz):
                # force data to 0 for dcbz
                comb += req.data.eq(0)
            with m.Elif(r0.d_valid):
                comb += req.data.eq(r0.req.data)
            with m.Else():
                comb += req.data.eq(d_in.data)

            # Select all bytes for dcbz
            # and for cacheable loads
            with m.If(r0.req.dcbz | (r0.req.load & ~r0.req.nc)):
                comb += req.byte_sel.eq(~0) # all 1s
            with m.Else():
                comb += req.byte_sel.eq(r0.req.byte_sel)
            comb += req.hit_way.eq(req_hit_way)
            comb += req.same_tag.eq(req_same_tag)

        # Store the incoming request from r0,
        # if it is a slow request
        # Note that r1.full = 1 implies req_op = OP_NONE
        with m.If((req_op == Op.OP_LOAD_MISS)
                  | (req_op == Op.OP_LOAD_NC)
                  | (req_op == Op.OP_STORE_MISS)
                  | (req_op == Op.OP_STORE_HIT)):
            sync += r1.req.eq(req)
            sync += r1.full.eq(1)
            # do not let r1.state RELOAD_WAIT_ACK or STORE_WAIT_ACK
            # destroy r1.req by overwriting r1.full back to zero
            comb += r1_next_cycle.eq(1)

        # Main state machine
        with m.Switch(r1.state):

            with m.Case(State.IDLE):
                sync += r1.wb.adr.eq(req.real_addr[ROW_OFF_BITS:])
                sync += r1.wb.sel.eq(req.byte_sel)
                sync += r1.wb.dat.eq(req.data)
                sync += r1.dcbz.eq(req.dcbz)

                # Keep track of our index and way
                # for subsequent stores.
                sync += r1.store_index.eq(req_idx)
                sync += r1.store_row.eq(req_row)
                sync += r1.end_row_ix.eq(get_row_of_line(req_row)-1)
                sync += r1.reload_tag.eq(req_tag)
                sync += r1.req.same_tag.eq(1)

                with m.If(req.op == Op.OP_STORE_HIT):
                    sync += r1.store_way.eq(req.hit_way)

                #with m.If(r1.dec_acks):
                #    sync += r1.acks_pending.eq(r1.acks_pending - 1)

                # Reset per-row valid bits,
                # ready for handling OP_LOAD_MISS
                for i in range(ROW_PER_LINE):
                    sync += r1.rows_valid[i].eq(0)

                with m.If(req_op != Op.OP_NONE):
                    sync += Display("cache op %d", req.op)

                with m.Switch(req.op):
                    with m.Case(Op.OP_LOAD_HIT):
                        # stay in IDLE state
                        pass

                    with m.Case(Op.OP_LOAD_MISS):
                        sync += Display("cache miss real addr: %x " \
                                        "idx: %x tag: %x",
                                        req.real_addr, req_row, req_tag)

                        # Start the wishbone cycle
                        sync += r1.wb.we.eq(0)
                        sync += r1.wb.cyc.eq(1)
                        sync += r1.wb.stb.eq(1)

                        # Track that we had one request sent
                        sync += r1.state.eq(State.RELOAD_WAIT_ACK)
                        sync += r1.write_tag.eq(1)

                    with m.Case(Op.OP_LOAD_NC):
                        sync += r1.wb.cyc.eq(1)
                        sync += r1.wb.stb.eq(1)
                        sync += r1.wb.we.eq(0)
                        sync += r1.state.eq(State.NC_LOAD_WAIT_ACK)

                    with m.Case(Op.OP_STORE_HIT, Op.OP_STORE_MISS):
                        with m.If(~req.dcbz):
                            sync += r1.state.eq(State.STORE_WAIT_ACK)
                            sync += r1.acks_pending.eq(1)
                            sync += r1.full.eq(0)
                            comb += r1_next_cycle.eq(0)
                            sync += r1.slow_valid.eq(1)

                            with m.If(req.mmu_req):
                                sync += r1.mmu_done.eq(1)
                            with m.Else():
                                sync += r1.ls_valid.eq(1)

                            with m.If(req.op == Op.OP_STORE_HIT):
                                sync += r1.write_bram.eq(1)
                        with m.Else():
                            # dcbz is handled much like a load miss except
                            # that we are writing to memory instead of reading
                            sync += r1.state.eq(State.RELOAD_WAIT_ACK)

                            with m.If(req.op == Op.OP_STORE_MISS):
                                sync += r1.write_tag.eq(1)

                        sync += r1.wb.we.eq(1)
                        sync += r1.wb.cyc.eq(1)
                        sync += r1.wb.stb.eq(1)

                    # OP_NONE and OP_BAD do nothing
                    # OP_BAD & OP_STCX_FAIL were
                    # handled above already
                    with m.Case(Op.OP_NONE):
                        pass
                    with m.Case(Op.OP_BAD):
                        pass
                    with m.Case(Op.OP_STCX_FAIL):
                        pass

            with m.Case(State.RELOAD_WAIT_ACK):
                ld_stbs_done = Signal()
                # Requests are all sent if stb is 0
                comb += ld_stbs_done.eq(~r1.wb.stb)

                # If we are still sending requests, was one accepted?
                with m.If((~bus.stall) & r1.wb.stb):
                    # That was the last word?  We are done sending.
                    # Clear stb and set ld_stbs_done so we can handle an
                    # eventual last ack on the same cycle.
                    # sigh - reconstruct wb adr with 3 extra 0s at front
                    wb_adr = Cat(Const(0, ROW_OFF_BITS), r1.wb.adr)
                    with m.If(is_last_row_addr(wb_adr, r1.end_row_ix)):
                        sync += r1.wb.stb.eq(0)
                        comb += ld_stbs_done.eq(1)

                    # Calculate the next row address in the current cache line
                    row = Signal(LINE_OFF_BITS-ROW_OFF_BITS)
                    comb += row.eq(r1.wb.adr)
                    sync += r1.wb.adr[:LINE_OFF_BITS-ROW_OFF_BITS].eq(row+1)

                # Incoming acks processing
                sync += r1.forward_valid1.eq(bus.ack)
                with m.If(bus.ack):
                    srow = Signal(ROW_LINE_BITS)
                    comb += srow.eq(r1.store_row)
                    sync += r1.rows_valid[srow].eq(1)

                    # If this is the data we were looking for,
                    # we can complete the request next cycle.
                    # Compare the whole address in case the
                    # request in r1.req is not the one that
                    # started this refill.
                    with m.If(r1.full & r1.req.same_tag &
                              ((r1.dcbz & req.dcbz) |
                               (r1.req.op == Op.OP_LOAD_MISS)) &
                              (r1.store_row == get_row(r1.req.real_addr))):
                        sync += r1.full.eq(r1_next_cycle)
                        sync += r1.slow_valid.eq(1)
                        with m.If(r1.mmu_req):
                            sync += r1.mmu_done.eq(1)
                        with m.Else():
                            sync += r1.ls_valid.eq(1)
                        sync += r1.forward_sel.eq(~0) # all 1s
                        sync += r1.use_forward1.eq(1)

                    # Check for completion
                    with m.If(ld_stbs_done & is_last_row(r1.store_row,
                                                         r1.end_row_ix)):
                        # Complete wishbone cycle
                        sync += r1.wb.cyc.eq(0)

                        # Cache line is now valid
                        cv = Signal(NUM_WAYS)
                        comb += cv.eq(cache_valids[r1.store_index])
                        comb += cv.bit_select(r1.store_way, 1).eq(1)
                        sync += cache_valids[r1.store_index].eq(cv)

                        sync += r1.state.eq(State.IDLE)
                        sync += Display("cache valid set %x "
                                        "idx %d way %d",
                                        cv, r1.store_index, r1.store_way)

                    # Increment store row counter
                    sync += r1.store_row.eq(next_row(r1.store_row))

            with m.Case(State.STORE_WAIT_ACK):
                st_stbs_done = Signal()
                adjust_acks = Signal(3)

                comb += st_stbs_done.eq(~r1.wb.stb)

                with m.If(r1.inc_acks != r1.dec_acks):
                    with m.If(r1.inc_acks):
                        comb += adjust_acks.eq(r1.acks_pending + 1)
                    with m.Else():
                        comb += adjust_acks.eq(r1.acks_pending - 1)
                with m.Else():
                    comb += adjust_acks.eq(r1.acks_pending)

                sync += r1.acks_pending.eq(adjust_acks)

                # Clear stb when slave accepted request
                with m.If(~bus.stall):
                    # See if there is another store waiting
                    # to be done which is in the same real page.
                    # (this is when same_tag is true)
                    with m.If(req.valid):
                        _ra = req.real_addr[ROW_OFF_BITS:SET_SIZE_BITS]
                        sync += r1.wb.adr[0:SET_SIZE_BITS-ROW_OFF_BITS].eq(_ra)
                        sync += r1.wb.dat.eq(req.data)
                        sync += r1.wb.sel.eq(req.byte_sel)

                    with m.If((adjust_acks < 7) & req.same_tag &
                              ((req.op == Op.OP_STORE_MISS) |
                               (req.op == Op.OP_STORE_HIT))):
                        sync += r1.wb.stb.eq(1)
                        comb += st_stbs_done.eq(0)
                        sync += r1.store_way.eq(req.hit_way)
                        sync += r1.store_row.eq(get_row(req.real_addr))

                        with m.If(req.op == Op.OP_STORE_HIT):
                            sync += r1.write_bram.eq(1)
                        sync += r1.full.eq(r1_next_cycle)
                        sync += r1.slow_valid.eq(1)

                        # Store requests never come from the MMU
                        sync += r1.ls_valid.eq(1)
                        comb += st_stbs_done.eq(0)
                        sync += r1.inc_acks.eq(1)
                    with m.Else():
                        sync += r1.wb.stb.eq(0)
                        comb += st_stbs_done.eq(1)

                # Got ack ? See if complete.
                sync += Display("got ack %d %d stbs %d adjust_acks %d",
                                bus.ack, bus.ack, st_stbs_done, adjust_acks)
                with m.If(bus.ack):
                    with m.If(st_stbs_done & (adjust_acks == 1)):
                        sync += r1.state.eq(State.IDLE)
                        sync += r1.wb.cyc.eq(0)
                        sync += r1.wb.stb.eq(0)
                    sync += r1.dec_acks.eq(1)

            with m.Case(State.NC_LOAD_WAIT_ACK):
                # Clear stb when slave accepted request
                with m.If(~bus.stall):
                    sync += r1.wb.stb.eq(0)

                # Got ack ? complete.
                with m.If(bus.ack):
                    sync += r1.state.eq(State.IDLE)
                    sync += r1.full.eq(r1_next_cycle)
                    sync += r1.slow_valid.eq(1)

                    with m.If(r1.mmu_req):
                        sync += r1.mmu_done.eq(1)
                    with m.Else():
                        sync += r1.ls_valid.eq(1)

                    sync += r1.forward_sel.eq(~0) # all 1s
                    sync += r1.use_forward1.eq(1)
                    sync += r1.wb.cyc.eq(0)
                    sync += r1.wb.stb.eq(0)

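    # NOTE: dcache_log is currently disabled (its call at the end of
    # elaborate is commented out); as written it refers to req_op and
    # r1.real_adr, which are not in scope here, so it would need fixing
    # up before being re-enabled.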
    def dcache_log(self, m, r1, valid_ra, tlb_hit, stall_out):

        sync = m.d.sync
        d_out, bus, log_out = self.d_out, self.bus, self.log_out

        sync += log_out.eq(Cat(r1.state[:3], valid_ra, tlb_hit.way[:3],
                               stall_out, req_op[:3], d_out.valid, d_out.error,
                               r1.wb.cyc, r1.wb.stb, bus.ack, bus.stall,
                               r1.real_adr[3:6]))

    def elaborate(self, platform):

        m = Module()
        comb, sync = m.d.comb, m.d.sync
        m_in, d_in = self.m_in, self.d_in

        # Storage. Hopefully "cache_rows" is a BRAM, the rest is LUTs
        cache_valids = CacheValidsArray()
        cache_tag_set = Signal(TAG_RAM_WIDTH)

        self.tagmem = Memory(depth=NUM_LINES, width=TAG_RAM_WIDTH)

        """note: these are passed to nmigen.hdl.Memory as "attributes".
           don't know how, just that they are.
        """
        # TODO attribute ram_style of
        #  dtlb_tags : signal is "distributed";
        # TODO attribute ram_style of
        #  dtlb_ptes : signal is "distributed";

        r0 = RegStage0("r0")
        r0_full = Signal()

        r1 = RegStage1("r1")

        reservation = Reservation("rsrv")

        # Async signals on incoming request
        req_index = Signal(INDEX_BITS)
        req_row = Signal(ROW_BITS)
        req_hit_way = Signal(WAY_BITS)
        req_tag = Signal(TAG_BITS)
        req_op = Signal(Op)
        req_data = Signal(64)
        req_same_tag = Signal()
        req_go = Signal()

        early_req_row = Signal(ROW_BITS)

        cancel_store = Signal()
        set_rsrv = Signal()
        clear_rsrv = Signal()

        r0_valid = Signal()
        r0_stall = Signal()

        use_forward1_next = Signal()
        use_forward2_next = Signal()

        cache_out_row = Signal(WB_DATA_BITS)

        plru_victim = Signal(WAY_BITS)
        replace_way = Signal(WAY_BITS)

        # Wishbone read/write/cache write formatting signals
        bus_sel = Signal(8)

        # TLB signals
        tlb_way = TLBRecord("tlb_way")
        tlb_req_index = Signal(TLB_SET_BITS)
        tlb_hit = TLBHit("tlb_hit")
        pte = Signal(TLB_PTE_BITS)
        ra = Signal(REAL_ADDR_BITS)
        valid_ra = Signal()
        perm_attr = PermAttr("dc_perms")
        rc_ok = Signal()
        perm_ok = Signal()
        access_ok = Signal()

        tlb_plru_victim = Signal(TLB_WAY_BITS)

        # we don't yet handle collisions between loadstore1 requests
        # and MMU requests
        comb += self.m_out.stall.eq(0)

        # Hold off the request in r0 when r1 has an uncompleted request
        comb += r0_stall.eq(r0_full & (r1.full | d_in.hold))
        comb += r0_valid.eq(r0_full & ~r1.full & ~d_in.hold)
        comb += self.stall_out.eq(r0_stall)
        # debugging: detect if any stall ever requested, which is fine,
        # but if a request comes in when stall requested, that's bad.
        with m.If(r0_stall):
            sync += self.any_stall_out.eq(1)
            with m.If(d_in.valid):
                sync += self.dreq_when_stall.eq(1)
            with m.If(m_in.valid):
                sync += self.mreq_when_stall.eq(1)

        # deal with litex not doing wishbone pipeline mode
        # XXX in wrong way.  FIFOs are needed in the SRAM test
        # so that stb/ack match up. same thing done in icache.py
        if not self.microwatt_compat:
            comb += self.bus.stall.eq(self.bus.cyc & ~self.bus.ack)

        # Wire up wishbone request latch out of stage 1
        comb += self.bus.we.eq(r1.wb.we)
        comb += self.bus.adr.eq(r1.wb.adr)
        comb += self.bus.sel.eq(r1.wb.sel)
        comb += self.bus.stb.eq(r1.wb.stb)
        comb += self.bus.dat_w.eq(r1.wb.dat)
        comb += self.bus.cyc.eq(r1.wb.cyc)

        # create submodule TLBUpdate
        m.submodules.dtlb_update = self.dtlb_update = DTLBUpdate()

        # call sub-functions putting everything together, using shared
        # signals established above
        self.stage_0(m, r0, r1, r0_full)
        self.tlb_read(m, r0_stall, tlb_way)
        self.tlb_search(m, tlb_req_index, r0, r0_valid,
                        tlb_way,
                        pte, tlb_hit, valid_ra, perm_attr, ra)
        self.tlb_update(m, r0_valid, r0, tlb_req_index,
                        tlb_hit, tlb_plru_victim)
        self.maybe_plrus(m, r1, plru_victim)
        self.maybe_tlb_plrus(m, r1, tlb_plru_victim, tlb_req_index)
        self.cache_tag_read(m, r0_stall, req_index, cache_tag_set)
        self.dcache_request(m, r0, ra, req_index, req_row, req_tag,
                            r0_valid, r1, cache_valids, replace_way,
                            use_forward1_next, use_forward2_next,
                            req_hit_way, plru_victim, rc_ok, perm_attr,
                            valid_ra, perm_ok, access_ok, req_op, req_go,
                            tlb_hit, tlb_way, cache_tag_set,
                            cancel_store, req_same_tag, r0_stall,
                            early_req_row)
        self.reservation_comb(m, cancel_store, set_rsrv, clear_rsrv,
                              r0_valid, r0, reservation)
        self.reservation_reg(m, r0_valid, access_ok, set_rsrv, clear_rsrv,
                             reservation, r0)
        self.writeback_control(m, r1, cache_out_row)
        self.rams(m, r1, early_req_row, cache_out_row, replace_way)
        self.dcache_fast_hit(m, req_op, r0_valid, r0, r1,
                             req_hit_way, req_index, req_tag, access_ok,
                             tlb_hit, tlb_req_index)
        self.dcache_slow(m, r1, use_forward1_next, use_forward2_next,
                         r0, replace_way,
                         req_hit_way, req_same_tag,
                         r0_valid, req_op, cache_valids, req_go, ra)
        #self.dcache_log(m, r1, valid_ra, tlb_hit, stall_out)

        return m


if __name__ == '__main__':
    dut = DCache()
    vl = rtlil.convert(dut, ports=[])
    with open("test_dcache.il", "w") as f:
        f.write(vl)