# src/soc/experiment/dcache.py
# (soc.git commit: convert DTLBUpdate to use a pair of Memorys)

"""DCache

based on Anton Blanchard microwatt dcache.vhdl

note that the microwatt dcache wishbone interface expects "stall".
for simplicity at the moment this is hard-coded to cyc & ~ack.
see WB4 spec, p84, section 5.2.1

IMPORTANT: for store, the data is sampled the cycle AFTER the "valid"
is raised.  sigh

Links:

* https://libre-soc.org/3d_gpu/architecture/set_associative_cache.jpg
* https://bugs.libre-soc.org/show_bug.cgi?id=469

"""

import sys

from nmutil.gtkw import write_gtkw

sys.setrecursionlimit(1000000)

from enum import Enum, unique

from nmigen import (Module, Signal, Elaboratable, Cat, Repl, Array, Const,
                    Record, Memory)
from nmutil.util import Display
from nmigen.lib.coding import Decoder

from copy import deepcopy
from random import randint, seed

from nmigen_soc.wishbone.bus import Interface

from nmigen.cli import main
from nmutil.iocontrol import RecordObject
from nmigen.utils import log2_int
from soc.experiment.mem_types import (LoadStore1ToDCacheType,
                                      DCacheToLoadStore1Type,
                                      MMUToDCacheType,
                                      DCacheToMMUType)

from soc.experiment.wb_types import (WB_ADDR_BITS, WB_DATA_BITS, WB_SEL_BITS,
                                     WBAddrType, WBDataType, WBSelType,
                                     WBMasterOut, WBSlaveOut,
                                     WBMasterOutVector, WBSlaveOutVector,
                                     WBIOMasterOut, WBIOSlaveOut)

from soc.experiment.cache_ram import CacheRam
#from soc.experiment.plru import PLRU
from nmutil.plru import PLRU, PLRUs

# for test
from soc.bus.sram import SRAM
from nmigen.cli import rtlil

# NOTE: to use cxxsim, export NMIGEN_SIM_MODE=cxxsim from the shell
# Also, check out the cxxsim nmigen branch, and latest yosys from git
from nmutil.sim_tmp_alternative import Simulator

from nmutil.util import wrap


# TODO: make these parameters of DCache at some point
LINE_SIZE = 64    # Line size in bytes
NUM_LINES = 16    # Number of lines in a set
NUM_WAYS = 4      # Number of ways
TLB_SET_SIZE = 64 # L1 DTLB entries per set
TLB_NUM_WAYS = 2  # L1 DTLB number of ways
TLB_LG_PGSZ = 12  # L1 DTLB log_2(page_size)
LOG_LENGTH = 0    # Non-zero to enable log data collection

# BRAM organisation: We never access more than
#     -- WB_DATA_BITS at a time so to save
#     -- resources we make the array only that wide, and
#     -- use consecutive indices to make a cache "line"
#     --
#     -- ROW_SIZE is the width in bytes of the BRAM
#     -- (based on WB, so 64-bits)
ROW_SIZE = WB_DATA_BITS // 8

# ROW_PER_LINE is the number of rows (wishbone
# transactions) in a line
ROW_PER_LINE = LINE_SIZE // ROW_SIZE

# BRAM_ROWS is the number of rows in BRAM needed
# to represent the full dcache
BRAM_ROWS = NUM_LINES * ROW_PER_LINE
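
# With the defaults above (LINE_SIZE=64, WB_DATA_BITS=64, NUM_LINES=16)
# this works out as: ROW_SIZE = 8 bytes, ROW_PER_LINE = 64//8 = 8,
# BRAM_ROWS = 16 * 8 = 128.  (a worked example only: the prints below
# report the live values)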

print ("ROW_SIZE", ROW_SIZE)
print ("ROW_PER_LINE", ROW_PER_LINE)
print ("BRAM_ROWS", BRAM_ROWS)
print ("NUM_WAYS", NUM_WAYS)

# Bit field counts in the address

# REAL_ADDR_BITS is the number of real address
# bits that we store
REAL_ADDR_BITS = 56

# ROW_BITS is the number of bits to select a row
ROW_BITS = log2_int(BRAM_ROWS)

# ROW_LINE_BITS is the number of bits to select
# a row within a line
ROW_LINE_BITS = log2_int(ROW_PER_LINE)

# LINE_OFF_BITS is the number of bits for
# the offset in a cache line
LINE_OFF_BITS = log2_int(LINE_SIZE)

# ROW_OFF_BITS is the number of bits for
# the offset in a row
ROW_OFF_BITS = log2_int(ROW_SIZE)

# INDEX_BITS is the number of bits to
# select a cache line
INDEX_BITS = log2_int(NUM_LINES)

# SET_SIZE_BITS is the log base 2 of the set size
SET_SIZE_BITS = LINE_OFF_BITS + INDEX_BITS

# TAG_BITS is the number of bits of
# the tag part of the address
TAG_BITS = REAL_ADDR_BITS - SET_SIZE_BITS

# TAG_WIDTH is the width in bits of each way of the tag RAM
TAG_WIDTH = TAG_BITS + 7 - ((TAG_BITS + 7) % 8)

# WAY_BITS is the number of bits to select a way
WAY_BITS = log2_int(NUM_WAYS)
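
# Worked example, derived from the default constants above:
# ROW_BITS=7, ROW_LINE_BITS=3, LINE_OFF_BITS=6, ROW_OFF_BITS=3,
# INDEX_BITS=4, SET_SIZE_BITS=10, TAG_BITS=46, TAG_WIDTH=48, WAY_BITS=2.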

# Example of layout for 32 lines of 64 bytes:
layout = """\
  .. tag               |index| line  |
  ..        |   row    |            |
  ..        |      |---|            | ROW_LINE_BITS (3)
  ..        |      |--- - --|         LINE_OFF_BITS (6)
  ..        |     |- --|              ROW_OFF_BITS  (3)
  ..        |----- ---|               ROW_BITS      (8)
  ..        |-----|                   INDEX_BITS    (5)
  .. --------|                        TAG_BITS      (45)
"""
print (layout)
print ("Dcache TAG %d IDX %d ROW_BITS %d ROFF %d LOFF %d RLB %d" % \
            (TAG_BITS, INDEX_BITS, ROW_BITS,
             ROW_OFF_BITS, LINE_OFF_BITS, ROW_LINE_BITS))
print ("index @: %d-%d" % (LINE_OFF_BITS, SET_SIZE_BITS))
print ("row @: %d-%d" % (ROW_OFF_BITS, SET_SIZE_BITS))
print ("tag @: %d-%d width %d" % (SET_SIZE_BITS, REAL_ADDR_BITS, TAG_WIDTH))

TAG_RAM_WIDTH = TAG_WIDTH * NUM_WAYS

print ("TAG_RAM_WIDTH", TAG_RAM_WIDTH)
print ("    TAG_WIDTH", TAG_WIDTH)
print ("     NUM_WAYS", NUM_WAYS)

def CacheTagArray():
    tag_layout = [('valid', 1),
                  ('tag', TAG_RAM_WIDTH),
                 ]
    return Array(Record(tag_layout, name="tag%d" % x) for x in range(NUM_LINES))

def RowPerLineValidArray():
    return Array(Signal(name="rows_valid%d" % x) \
                 for x in range(ROW_PER_LINE))

# L1 TLB
TLB_SET_BITS = log2_int(TLB_SET_SIZE)
TLB_WAY_BITS = log2_int(TLB_NUM_WAYS)
TLB_EA_TAG_BITS = 64 - (TLB_LG_PGSZ + TLB_SET_BITS)
TLB_TAG_WAY_BITS = TLB_NUM_WAYS * TLB_EA_TAG_BITS
TLB_PTE_BITS = 64
TLB_PTE_WAY_BITS = TLB_NUM_WAYS * TLB_PTE_BITS
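
# Worked example with the defaults: TLB_SET_BITS=6, TLB_WAY_BITS=1,
# TLB_EA_TAG_BITS = 64-(12+6) = 46, TLB_TAG_WAY_BITS = 2*46 = 92,
# TLB_PTE_WAY_BITS = 2*64 = 128.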

def ispow2(x):
    return (1<<log2_int(x, False)) == x

assert (LINE_SIZE % ROW_SIZE) == 0, "LINE_SIZE not multiple of ROW_SIZE"
assert ispow2(LINE_SIZE), "LINE_SIZE not power of 2"
assert ispow2(NUM_LINES), "NUM_LINES not power of 2"
assert ispow2(ROW_PER_LINE), "ROW_PER_LINE not power of 2"
assert ROW_BITS == (INDEX_BITS + ROW_LINE_BITS), "geometry bits don't add up"
assert (LINE_OFF_BITS == ROW_OFF_BITS + ROW_LINE_BITS), \
        "geometry bits don't add up"
assert REAL_ADDR_BITS == (TAG_BITS + INDEX_BITS + LINE_OFF_BITS), \
        "geometry bits don't add up"
assert REAL_ADDR_BITS == (TAG_BITS + ROW_BITS + ROW_OFF_BITS), \
        "geometry bits don't add up"
assert 64 == WB_DATA_BITS, "Can't yet handle wb width that isn't 64-bits"
assert SET_SIZE_BITS <= TLB_LG_PGSZ, "Set indexed by virtual address"


def TLBHit(name):
    return Record([('valid', 1),
                   ('way', TLB_WAY_BITS)], name=name)

def TLBTagEAArray():
    return Array(Signal(TLB_EA_TAG_BITS, name="tlbtagea%d" % x) \
                 for x in range(TLB_NUM_WAYS))

def TLBRecord(name):
    tlb_layout = [('valid', TLB_NUM_WAYS),
                  ('tag', TLB_TAG_WAY_BITS),
                  ('pte', TLB_PTE_WAY_BITS)
                 ]
    return Record(tlb_layout, name=name)

def TLBArray():
    return Array(TLBRecord(name="tlb%d" % x) for x in range(TLB_SET_SIZE))

def HitWaySet():
    return Array(Signal(WAY_BITS, name="hitway_%d" % x) \
                 for x in range(TLB_NUM_WAYS))

# Cache RAM interface
def CacheRamOut():
    return Array(Signal(WB_DATA_BITS, name="cache_out%d" % x) \
                 for x in range(NUM_WAYS))

# PLRU output interface
def PLRUOut():
    return Array(Signal(WAY_BITS, name="plru_out%d" % x) \
                 for x in range(NUM_LINES))

# TLB PLRU output interface
def TLBPLRUOut():
    return Array(Signal(TLB_WAY_BITS, name="tlbplru_out%d" % x) \
                 for x in range(TLB_SET_SIZE))

# Helper functions to decode incoming requests
#
# Return the cache line index (tag index) for an address
def get_index(addr):
    return addr[LINE_OFF_BITS:SET_SIZE_BITS]

# Return the cache row index (data memory) for an address
def get_row(addr):
    return addr[ROW_OFF_BITS:SET_SIZE_BITS]

# Return the index of a row within a line
def get_row_of_line(row):
    return row[:ROW_BITS][:ROW_LINE_BITS]

# Returns whether the address is in the last row of a line
def is_last_row_addr(addr, last):
    return addr[ROW_OFF_BITS:LINE_OFF_BITS] == last

# Returns whether this is the last row of a line
def is_last_row(row, last):
    return get_row_of_line(row) == last

# Return the next row in the current cache line. We use a
# dedicated function in order to limit the size of the
# generated adder to be only the bits within a cache line
# (3 bits with default settings)
def next_row(row):
    row_v = row[0:ROW_LINE_BITS] + 1
    return Cat(row_v[:ROW_LINE_BITS], row[ROW_LINE_BITS:])
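
# e.g. with ROW_LINE_BITS=3, next_row(0b0101_111) gives 0b0101_000:
# only the bottom ROW_LINE_BITS wrap; the upper (line-select) bits
# pass through unchanged.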

# Get the tag value from the address
def get_tag(addr):
    return addr[SET_SIZE_BITS:REAL_ADDR_BITS]

# Read a tag from a tag memory row
def read_tag(way, tagset):
    return tagset.word_select(way, TAG_WIDTH)[:TAG_BITS]
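
# (tagset is TAG_RAM_WIDTH wide, i.e. NUM_WAYS slots of TAG_WIDTH bits:
# word_select picks the whole byte-padded slot for "way", and the
# [:TAG_BITS] slice then drops the padding.)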

# Read a TLB tag from a TLB tag memory row
def read_tlb_tag(way, tags):
    return tags.word_select(way, TLB_EA_TAG_BITS)

# Write a TLB tag to a TLB tag memory row
def write_tlb_tag(way, tags, tag):
    return read_tlb_tag(way, tags).eq(tag)

# Read a PTE from a TLB PTE memory row
def read_tlb_pte(way, ptes):
    return ptes.word_select(way, TLB_PTE_BITS)

def write_tlb_pte(way, ptes, newpte):
    return read_tlb_pte(way, ptes).eq(newpte)


# Record for storing permission, attribute, etc. bits from a PTE
class PermAttr(RecordObject):
    def __init__(self, name=None):
        super().__init__(name=name)
        self.reference = Signal()
        self.changed = Signal()
        self.nocache = Signal()
        self.priv = Signal()
        self.rd_perm = Signal()
        self.wr_perm = Signal()


def extract_perm_attr(pte):
    pa = PermAttr()
    return pa  # note: currently a stub - tlb_search sets the fields directly


# Type of operation on a "valid" input
@unique
class Op(Enum):
    OP_NONE = 0
    OP_BAD = 1           # NC cache hit, TLB miss, prot/RC failure
    OP_STCX_FAIL = 2     # conditional store w/o reservation
    OP_LOAD_HIT = 3      # Cache hit on load
    OP_LOAD_MISS = 4     # Load missing cache
    OP_LOAD_NC = 5       # Non-cachable load
    OP_STORE_HIT = 6     # Store hitting cache
    OP_STORE_MISS = 7    # Store missing cache


# Cache state machine
@unique
class State(Enum):
    IDLE = 0             # Normal load hit processing
    RELOAD_WAIT_ACK = 1  # Cache reload wait ack
    STORE_WAIT_ACK = 2   # Store wait ack
    NC_LOAD_WAIT_ACK = 3 # Non-cachable load wait ack


# Dcache operations:
#
# In order to make timing, we use the BRAMs with
# an output buffer, which means that the BRAM
# output is delayed by an extra cycle.
#
# Thus, the dcache has a 2-stage internal pipeline
# for cache hits with no stalls.
#
# All other operations are handled via stalling
# in the first stage.
#
# The second stage can thus complete a hit at the same
# time as the first stage emits a stall for a complex op.
#
# Stage 0 register, basically contains just the latched request

class RegStage0(RecordObject):
    def __init__(self, name=None):
        super().__init__(name=name)
        self.req = LoadStore1ToDCacheType(name="lsmem")
        self.tlbie = Signal()   # indicates a tlbie request (from MMU)
        self.doall = Signal()   # with tlbie, indicates flush whole TLB
        self.tlbld = Signal()   # indicates a TLB load request (from MMU)
        self.mmu_req = Signal() # indicates source of request
        self.d_valid = Signal() # indicates req.data is valid now


class MemAccessRequest(RecordObject):
    def __init__(self, name=None):
        super().__init__(name=name)
        self.op        = Signal(Op)
        self.valid     = Signal()
        self.dcbz      = Signal()
        self.real_addr = Signal(REAL_ADDR_BITS)
        self.data      = Signal(64)
        self.byte_sel  = Signal(8)
        self.hit_way   = Signal(WAY_BITS)
        self.same_tag  = Signal()
        self.mmu_req   = Signal()


# First stage register, contains state for stage 1 of load hits
# and for the state machine used by all other operations
class RegStage1(RecordObject):
    def __init__(self, name=None):
        super().__init__(name=name)
        # Info about the request
        self.full    = Signal() # have uncompleted request
        self.mmu_req = Signal() # request is from MMU
        self.req     = MemAccessRequest(name="reqmem")

        # Cache hit state
        self.hit_way        = Signal(WAY_BITS)
        self.hit_load_valid = Signal()
        self.hit_index      = Signal(INDEX_BITS)
        self.cache_hit      = Signal()

        # TLB hit state
        self.tlb_hit       = TLBHit("tlb_hit")
        self.tlb_hit_index = Signal(TLB_SET_BITS)

        # 2-stage data buffer for data forwarded from writes to reads
        self.forward_data1  = Signal(64)
        self.forward_data2  = Signal(64)
        self.forward_sel1   = Signal(8)
        self.forward_valid1 = Signal()
        self.forward_way1   = Signal(WAY_BITS)
        self.forward_row1   = Signal(ROW_BITS)
        self.use_forward1   = Signal()
        self.forward_sel    = Signal(8)

        # Cache miss state (reload state machine)
        self.state        = Signal(State)
        self.dcbz         = Signal()
        self.write_bram   = Signal()
        self.write_tag    = Signal()
        self.slow_valid   = Signal()
        self.wb           = WBMasterOut("wb")
        self.reload_tag   = Signal(TAG_BITS)
        self.store_way    = Signal(WAY_BITS)
        self.store_row    = Signal(ROW_BITS)
        self.store_index  = Signal(INDEX_BITS)
        self.end_row_ix   = Signal(ROW_LINE_BITS)
        self.rows_valid   = RowPerLineValidArray()
        self.acks_pending = Signal(3)
        self.inc_acks     = Signal()
        self.dec_acks     = Signal()

        # Signals to complete (possibly with error)
        self.ls_valid      = Signal()
        self.ls_error      = Signal()
        self.mmu_done      = Signal()
        self.mmu_error     = Signal()
        self.cache_paradox = Signal()

        # Signal to complete a failed stcx.
        self.stcx_fail = Signal()


# Reservation information
class Reservation(RecordObject):
    def __init__(self):
        super().__init__()
        self.valid = Signal()
        self.addr  = Signal(64-LINE_OFF_BITS)


class DTLBUpdate(Elaboratable):
    def __init__(self):
        self.dtlb = TLBArray()
        self.tlbie = Signal()
        self.tlbwe = Signal()
        self.doall = Signal()
        self.tlb_hit = TLBHit("tlb_hit")
        self.tlb_req_index = Signal(TLB_SET_BITS)

        self.tlb_tag_way = Signal(TLB_TAG_WAY_BITS)
        self.tlb_pte_way = Signal(TLB_PTE_WAY_BITS)
        self.repl_way = Signal(TLB_WAY_BITS)
        self.eatag = Signal(TLB_EA_TAG_BITS)
        self.pte_data = Signal(TLB_PTE_BITS)

        # read from dtlb array
        self.tlb_read = Signal()
        self.tlb_read_index = Signal(TLB_SET_BITS)
        self.tlb_way = TLBRecord("o_tlb_way")

    def elaborate(self, platform):
        m = Module()
        comb = m.d.comb
        sync = m.d.sync

        dtlb, tlb_req_index = self.dtlb, self.tlb_req_index

        print ("TLB_TAG_WAY_BITS", TLB_TAG_WAY_BITS)
        print ("  TLB_EA_TAG_BITS", TLB_EA_TAG_BITS)
        print ("     TLB_NUM_WAYS", TLB_NUM_WAYS)
        print ("TLB_PTE_WAY_BITS", TLB_PTE_WAY_BITS)
        print ("     TLB_PTE_BITS", TLB_PTE_BITS)
        print ("     TLB_NUM_WAYS", TLB_NUM_WAYS)

        # TAG and PTE Memory SRAMs. transparent, write-enables are TLB_NUM_WAYS
        tagway = Memory(depth=TLB_SET_SIZE, width=TLB_TAG_WAY_BITS)
        m.submodules.rd_tagway = rd_tagway = tagway.read_port()
        m.submodules.wr_tagway = wr_tagway = tagway.write_port(
                                    granularity=TLB_EA_TAG_BITS)

        pteway = Memory(depth=TLB_SET_SIZE, width=TLB_PTE_WAY_BITS)
        m.submodules.rd_pteway = rd_pteway = pteway.read_port()
        m.submodules.wr_pteway = wr_pteway = pteway.write_port(
                                    granularity=TLB_PTE_BITS)
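
        # each write port is divided into TLB_NUM_WAYS granules, so "en"
        # becomes a per-way write-enable vector: one way's tag/PTE can be
        # updated without disturbing the other way (see the wr_tagway.en /
        # wr_pteway.en assignments below).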

        m.d.comb += rd_pteway.addr.eq(self.tlb_read_index)
        m.d.comb += rd_tagway.addr.eq(self.tlb_read_index)
        m.d.comb += wr_tagway.addr.eq(tlb_req_index)
        m.d.comb += wr_pteway.addr.eq(tlb_req_index)

        tagset = Signal(TLB_TAG_WAY_BITS)
        pteset = Signal(TLB_PTE_WAY_BITS)
        updated = Signal()
        v_updated = Signal()
        tb_out = Signal(TLB_TAG_WAY_BITS) # tlb_way_tags_t
        db_out = Signal(TLB_NUM_WAYS)     # tlb_way_valids_t
        pb_out = Signal(TLB_PTE_WAY_BITS) # tlb_way_ptes_t
        dv = Signal(TLB_NUM_WAYS)         # tlb_way_valids_t

        comb += dv.eq(dtlb[tlb_req_index].valid)
        comb += db_out.eq(dv)

        with m.If(self.tlbie & self.doall):
            # clear all valid bits at once
            for i in range(TLB_SET_SIZE):
                sync += dtlb[i].valid.eq(0)
        with m.Elif(self.tlbie):
            # invalidate just the hit_way
            with m.If(self.tlb_hit.valid):
                comb += db_out.bit_select(self.tlb_hit.way, 1).eq(0)
                comb += v_updated.eq(1)
        with m.Elif(self.tlbwe):
            # write to the requested tag and PTE
            comb += write_tlb_tag(self.repl_way, tb_out, self.eatag)
            comb += write_tlb_pte(self.repl_way, pb_out, self.pte_data)
            # set valid bit
            comb += db_out.bit_select(self.repl_way, 1).eq(1)

            comb += updated.eq(1)
            comb += v_updated.eq(1)

        with m.If(updated):
            comb += wr_pteway.data.eq(pb_out)
            comb += wr_pteway.en.eq(1<<self.repl_way)
            comb += wr_tagway.data.eq(tb_out)
            comb += wr_tagway.en.eq(1<<self.repl_way)
        with m.If(v_updated):
            sync += dtlb[tlb_req_index].valid.eq(db_out)

        # select one TLB way
        r_tlb_way = TLBRecord("r_tlb_way")
        r_delay = Signal()
        sync += r_delay.eq(self.tlb_read)
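        # note: the Memory read ports are synchronous, so rd_tagway.data
        # and rd_pteway.data are only valid the cycle *after* tlb_read is
        # asserted.  r_delay marks that cycle, and r_tlb_way then holds
        # the last-read value stable until the next read.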
        with m.If(self.tlb_read):
            sync += self.tlb_way.valid.eq(dtlb[self.tlb_read_index].valid)
        with m.If(r_delay):
            comb += self.tlb_way.tag.eq(rd_tagway.data)
            comb += self.tlb_way.pte.eq(rd_pteway.data)
            sync += r_tlb_way.tag.eq(rd_tagway.data)
            sync += r_tlb_way.pte.eq(rd_pteway.data)
        with m.Else():
            comb += self.tlb_way.tag.eq(r_tlb_way.tag)
            comb += self.tlb_way.pte.eq(r_tlb_way.pte)

        return m


class DCachePendingHit(Elaboratable):

    def __init__(self, tlb_way,
                      cache_i_validdx, cache_tag_set,
                      req_addr,
                      hit_set):

        self.go = Signal()
        self.virt_mode = Signal()
        self.is_hit = Signal()
        self.tlb_hit = TLBHit("tlb_hit")
        self.hit_way = Signal(WAY_BITS)
        self.rel_match = Signal()
        self.req_index = Signal(INDEX_BITS)
        self.reload_tag = Signal(TAG_BITS)

        self.tlb_way = tlb_way
        self.cache_i_validdx = cache_i_validdx
        self.cache_tag_set = cache_tag_set
        self.req_addr = req_addr
        self.hit_set = hit_set

    def elaborate(self, platform):
        m = Module()
        comb = m.d.comb
        sync = m.d.sync

        go = self.go
        virt_mode = self.virt_mode
        is_hit = self.is_hit
        tlb_way = self.tlb_way
        cache_i_validdx = self.cache_i_validdx
        cache_tag_set = self.cache_tag_set
        req_addr = self.req_addr
        tlb_hit = self.tlb_hit
        hit_set = self.hit_set
        hit_way = self.hit_way
        rel_match = self.rel_match
        req_index = self.req_index
        reload_tag = self.reload_tag

        rel_matches = Array(Signal(name="rel_matches_%d" % i) \
                            for i in range(TLB_NUM_WAYS))
        hit_way_set = HitWaySet()

        # Test if pending request is a hit on any way
        # In order to make timing in virtual mode,
        # when we are using the TLB, we compare each
        # way with each of the real addresses from each way of
        # the TLB, and then decide later which match to use.

        with m.If(virt_mode):
            for j in range(TLB_NUM_WAYS): # tlb_num_way_t
                s_tag = Signal(TAG_BITS, name="s_tag%d" % j)
                s_hit = Signal()
                s_pte = Signal(TLB_PTE_BITS)
                s_ra = Signal(REAL_ADDR_BITS)
                comb += s_pte.eq(read_tlb_pte(j, tlb_way.pte))
                comb += s_ra.eq(Cat(req_addr[0:TLB_LG_PGSZ],
                                    s_pte[TLB_LG_PGSZ:REAL_ADDR_BITS]))
                comb += s_tag.eq(get_tag(s_ra))

                for i in range(NUM_WAYS): # way_t
                    is_tag_hit = Signal(name="is_tag_hit_%d_%d" % (j, i))
                    comb += is_tag_hit.eq(go & cache_i_validdx[i] &
                                  (read_tag(i, cache_tag_set) == s_tag)
                                  & (tlb_way.valid[j]))
                    with m.If(is_tag_hit):
                        comb += hit_way_set[j].eq(i)
                        comb += s_hit.eq(1)
                comb += hit_set[j].eq(s_hit)
                with m.If(s_tag == reload_tag):
                    comb += rel_matches[j].eq(1)
            with m.If(tlb_hit.valid):
                comb += is_hit.eq(hit_set[tlb_hit.way])
                comb += hit_way.eq(hit_way_set[tlb_hit.way])
                comb += rel_match.eq(rel_matches[tlb_hit.way])
        with m.Else():
            s_tag = Signal(TAG_BITS)
            comb += s_tag.eq(get_tag(req_addr))
            for i in range(NUM_WAYS): # way_t
                is_tag_hit = Signal(name="is_tag_hit_%d" % i)
                comb += is_tag_hit.eq(go & cache_i_validdx[i] &
                          (read_tag(i, cache_tag_set) == s_tag))
                with m.If(is_tag_hit):
                    comb += hit_way.eq(i)
                    comb += is_hit.eq(1)
            with m.If(s_tag == reload_tag):
                comb += rel_match.eq(1)

        return m


class DCache(Elaboratable):
    """Set associative dcache write-through

    TODO (in no specific order):
    * See list in icache.vhdl
    * Complete load misses on the cycle when WB data comes instead of
      at the end of line (this requires dealing with requests coming in
      while not idle...)
    """
    def __init__(self):
        self.d_in      = LoadStore1ToDCacheType("d_in")
        self.d_out     = DCacheToLoadStore1Type("d_out")

        self.m_in      = MMUToDCacheType("m_in")
        self.m_out     = DCacheToMMUType("m_out")

        self.stall_out = Signal()

        # standard naming (wired to non-standard for compatibility)
        self.bus = Interface(addr_width=32,
                             data_width=64,
                             granularity=8,
                             features={'stall'},
                             alignment=0,
                             name="dcache")

        self.log_out = Signal(20)

    def stage_0(self, m, r0, r1, r0_full):
        """Latch the request in r0.req as long as we're not stalling
        """
        comb = m.d.comb
        sync = m.d.sync
        d_in, d_out, m_in = self.d_in, self.d_out, self.m_in

        r = RegStage0("stage0")

        # TODO, this goes in unit tests and formal proofs
        with m.If(d_in.valid & m_in.valid):
            sync += Display("request collision loadstore vs MMU")

        with m.If(m_in.valid):
            comb += r.req.valid.eq(1)
            comb += r.req.load.eq(~(m_in.tlbie | m_in.tlbld)) # no invalidate
            comb += r.req.dcbz.eq(0)
            comb += r.req.nc.eq(0)
            comb += r.req.reserve.eq(0)
            comb += r.req.virt_mode.eq(0)
            comb += r.req.priv_mode.eq(1)
            comb += r.req.addr.eq(m_in.addr)
            comb += r.req.data.eq(m_in.pte)
            comb += r.req.byte_sel.eq(~0) # Const -1 sets all to 0b111....
            comb += r.tlbie.eq(m_in.tlbie)
            comb += r.doall.eq(m_in.doall)
            comb += r.tlbld.eq(m_in.tlbld)
            comb += r.mmu_req.eq(1)
            m.d.sync += Display("    DCACHE req mmu addr %x pte %x ld %d",
                                 m_in.addr, m_in.pte, r.req.load)

        with m.Else():
            comb += r.req.eq(d_in)
            comb += r.req.data.eq(0)
            comb += r.tlbie.eq(0)
            comb += r.doall.eq(0)
            comb += r.tlbld.eq(0)
            comb += r.mmu_req.eq(0)
        with m.If((~r1.full & ~d_in.hold) | ~r0_full):
            sync += r0.eq(r)
            sync += r0_full.eq(r.req.valid)
        # Sample data the cycle after a request comes in from loadstore1.
        # If another request has come in already then the data will get
        # put directly into req.data below.
        with m.If(r0.req.valid & ~r.req.valid & ~r0.d_valid &
                  ~r0.mmu_req):
            sync += r0.req.data.eq(d_in.data)
            sync += r0.d_valid.eq(1)
        with m.If(d_in.valid):
            m.d.sync += Display("    DCACHE req cache "
                                "virt %d addr %x data %x ld %d",
                                 r.req.virt_mode, r.req.addr,
                                 r.req.data, r.req.load)

    def tlb_read(self, m, r0_stall, tlb_way, dtlb):
        """TLB
        Operates in the second cycle on the request latched in r0.req.
        TLB updates write the entry at the end of the second cycle.
        """
        comb = m.d.comb
        sync = m.d.sync
        m_in, d_in = self.m_in, self.d_in

        addrbits = Signal(TLB_SET_BITS)

        amin = TLB_LG_PGSZ
        amax = TLB_LG_PGSZ + TLB_SET_BITS

        with m.If(m_in.valid):
            comb += addrbits.eq(m_in.addr[amin : amax])
        with m.Else():
            comb += addrbits.eq(d_in.addr[amin : amax])

        # If we have any op and the previous op isn't finished,
        # then keep the same output for next cycle.
        d = self.dtlb_update
        comb += d.tlb_read_index.eq(addrbits)
        comb += d.tlb_read.eq(~r0_stall)
        comb += tlb_way.eq(d.tlb_way)

    def maybe_tlb_plrus(self, m, r1, tlb_plru_victim, tlb_req_index):
        """Generate TLB PLRUs
        """
        comb = m.d.comb
        sync = m.d.sync

        if TLB_NUM_WAYS == 0:
            return

        # Binary-to-Unary one-hot, enabled by tlb_hit valid
        tlb_plrus = PLRUs(TLB_SET_SIZE, TLB_WAY_BITS)
        m.submodules.tlb_plrus = tlb_plrus
        comb += tlb_plrus.way.eq(r1.tlb_hit.way)
        comb += tlb_plrus.valid.eq(r1.tlb_hit.valid)
        comb += tlb_plrus.index.eq(r1.tlb_hit_index)
        comb += tlb_plrus.isel.eq(tlb_req_index)      # select victim
        comb += tlb_plru_victim.eq(tlb_plrus.o_index) # selected victim

    def tlb_search(self, m, tlb_req_index, r0, r0_valid,
                   tlb_way,
                   pte, tlb_hit, valid_ra, perm_attr, ra):

        comb = m.d.comb

        hitway = Signal(TLB_WAY_BITS)
        hit    = Signal()
        eatag  = Signal(TLB_EA_TAG_BITS)

        TLB_LG_END = TLB_LG_PGSZ + TLB_SET_BITS
        comb += tlb_req_index.eq(r0.req.addr[TLB_LG_PGSZ : TLB_LG_END])
        comb += eatag.eq(r0.req.addr[TLB_LG_END : 64 ])

        for i in range(TLB_NUM_WAYS):
            is_tag_hit = Signal(name="is_tag_hit%d" % i)
            tlb_tag = Signal(TLB_EA_TAG_BITS, name="tlb_tag%d" % i)
            comb += tlb_tag.eq(read_tlb_tag(i, tlb_way.tag))
            comb += is_tag_hit.eq((tlb_way.valid[i]) & (tlb_tag == eatag))
            with m.If(is_tag_hit):
                comb += hitway.eq(i)
                comb += hit.eq(1)

        comb += tlb_hit.valid.eq(hit & r0_valid)
        comb += tlb_hit.way.eq(hitway)

        with m.If(tlb_hit.valid):
            comb += pte.eq(read_tlb_pte(hitway, tlb_way.pte))
        comb += valid_ra.eq(tlb_hit.valid | ~r0.req.virt_mode)

        with m.If(r0.req.virt_mode):
            comb += ra.eq(Cat(Const(0, ROW_OFF_BITS),
                              r0.req.addr[ROW_OFF_BITS:TLB_LG_PGSZ],
                              pte[TLB_LG_PGSZ:REAL_ADDR_BITS]))
            comb += perm_attr.reference.eq(pte[8])
            comb += perm_attr.changed.eq(pte[7])
            comb += perm_attr.nocache.eq(pte[5])
            comb += perm_attr.priv.eq(pte[3])
            comb += perm_attr.rd_perm.eq(pte[2])
            comb += perm_attr.wr_perm.eq(pte[1])
        with m.Else():
            comb += ra.eq(Cat(Const(0, ROW_OFF_BITS),
                              r0.req.addr[ROW_OFF_BITS:REAL_ADDR_BITS]))
            comb += perm_attr.reference.eq(1)
            comb += perm_attr.changed.eq(1)
            comb += perm_attr.nocache.eq(0)
            comb += perm_attr.priv.eq(1)
            comb += perm_attr.rd_perm.eq(1)
            comb += perm_attr.wr_perm.eq(1)

        with m.If(valid_ra):
            m.d.sync += Display("DCACHE virt mode %d hit %d ra %x pte %x",
                                r0.req.virt_mode, tlb_hit.valid, ra, pte)
            m.d.sync += Display("       perm ref=%d", perm_attr.reference)
            m.d.sync += Display("       perm chg=%d", perm_attr.changed)
            m.d.sync += Display("       perm noc=%d", perm_attr.nocache)
            m.d.sync += Display("       perm prv=%d", perm_attr.priv)
            m.d.sync += Display("       perm rdp=%d", perm_attr.rd_perm)
            m.d.sync += Display("       perm wrp=%d", perm_attr.wr_perm)

    def tlb_update(self, m, r0_valid, r0, dtlb, tlb_req_index,
                   tlb_hit, tlb_plru_victim, tlb_way):

        comb = m.d.comb
        sync = m.d.sync

        tlbie = Signal()
        tlbwe = Signal()

        comb += tlbie.eq(r0_valid & r0.tlbie)
        comb += tlbwe.eq(r0_valid & r0.tlbld)

        d = self.dtlb_update

        comb += d.tlbie.eq(tlbie)
        comb += d.tlbwe.eq(tlbwe)
        comb += d.doall.eq(r0.doall)
        comb += d.tlb_hit.eq(tlb_hit)
        comb += d.tlb_tag_way.eq(tlb_way.tag)
        comb += d.tlb_pte_way.eq(tlb_way.pte)
        comb += d.tlb_req_index.eq(tlb_req_index)

        with m.If(tlb_hit.valid):
            comb += d.repl_way.eq(tlb_hit.way)
        with m.Else():
            comb += d.repl_way.eq(tlb_plru_victim)
        comb += d.eatag.eq(r0.req.addr[TLB_LG_PGSZ + TLB_SET_BITS:64])
        comb += d.pte_data.eq(r0.req.data)

    def maybe_plrus(self, m, r1, plru_victim):
        """Generate PLRUs
        """
        comb = m.d.comb
        sync = m.d.sync

        if NUM_WAYS == 0:
            return

        m.submodules.plrus = plrus = PLRUs(NUM_LINES, WAY_BITS)
        comb += plrus.way.eq(r1.hit_way)
        comb += plrus.valid.eq(r1.cache_hit)
        comb += plrus.index.eq(r1.hit_index)
        comb += plrus.isel.eq(r1.store_index)    # select victim
        comb += plru_victim.eq(plrus.o_index)    # selected victim

    def cache_tag_read(self, m, r0_stall, req_index, cache_tag_set, cache_tags):
        """Cache tag RAM read port
        """
        comb = m.d.comb
        sync = m.d.sync
        m_in, d_in = self.m_in, self.d_in

        index = Signal(INDEX_BITS)

        with m.If(r0_stall):
            comb += index.eq(req_index)
        with m.Elif(m_in.valid):
            comb += index.eq(get_index(m_in.addr))
        with m.Else():
            comb += index.eq(get_index(d_in.addr))
        sync += cache_tag_set.eq(cache_tags[index].tag)

    def dcache_request(self, m, r0, ra, req_index, req_row, req_tag,
                       r0_valid, r1, cache_tags, replace_way,
                       use_forward1_next, use_forward2_next,
                       req_hit_way, plru_victim, rc_ok, perm_attr,
                       valid_ra, perm_ok, access_ok, req_op, req_go,
                       tlb_hit, tlb_way, cache_tag_set,
                       cancel_store, req_same_tag, r0_stall, early_req_row):
        """Cache request parsing and hit detection
        """

        comb = m.d.comb
        m_in, d_in = self.m_in, self.d_in

        is_hit  = Signal()
        hit_way = Signal(WAY_BITS)
        op      = Signal(Op)
        opsel   = Signal(3)
        go      = Signal()
        nc      = Signal()
        hit_set = Array(Signal(name="hit_set_%d" % i) \
                        for i in range(TLB_NUM_WAYS))
        cache_i_validdx = Signal(NUM_WAYS)

        # Extract line, row and tag from request
        comb += req_index.eq(get_index(r0.req.addr))
        comb += req_row.eq(get_row(r0.req.addr))
        comb += req_tag.eq(get_tag(ra))

        if False: # display on comb is a bit... busy.
            comb += Display("dcache_req addr:%x ra: %x idx: %x tag: %x row: %x",
                            r0.req.addr, ra, req_index, req_tag, req_row)

        comb += go.eq(r0_valid & ~(r0.tlbie | r0.tlbld) & ~r1.ls_error)
        comb += cache_i_validdx.eq(cache_tags[req_index].valid)

        m.submodules.dcache_pend = dc = DCachePendingHit(tlb_way,
                                            cache_i_validdx, cache_tag_set,
                                            r0.req.addr,
                                            hit_set)
        comb += dc.tlb_hit.eq(tlb_hit)
        comb += dc.reload_tag.eq(r1.reload_tag)
        comb += dc.virt_mode.eq(r0.req.virt_mode)
        comb += dc.go.eq(go)
        comb += dc.req_index.eq(req_index)

        comb += is_hit.eq(dc.is_hit)
        comb += hit_way.eq(dc.hit_way)
        comb += req_same_tag.eq(dc.rel_match)

        # See if the request matches the line currently being reloaded
        with m.If((r1.state == State.RELOAD_WAIT_ACK) &
                  (req_index == r1.store_index) & req_same_tag):
            # For a store, consider this a hit even if the row isn't
            # valid since it will be by the time we perform the store.
            # For a load, check the appropriate row valid bit.
            rrow = Signal(ROW_LINE_BITS)
            comb += rrow.eq(req_row)
            valid = r1.rows_valid[rrow]
            comb += is_hit.eq((~r0.req.load) | valid)
            comb += hit_way.eq(replace_way)

        # Whether to use forwarded data for a load or not
        with m.If((get_row(r1.req.real_addr) == req_row) &
                  (r1.req.hit_way == hit_way)):
            # Only need to consider r1.write_bram here, since if we
            # are writing refill data here, then we don't have a
            # cache hit this cycle on the line being refilled.
            # (There is the possibility that the load following the
            # load miss that started the refill could be to the old
            # contents of the victim line, since it is a couple of
            # cycles after the refill starts before we see the updated
            # cache tag. In that case we don't use the bypass.)
            comb += use_forward1_next.eq(r1.write_bram)
        with m.If((r1.forward_row1 == req_row) & (r1.forward_way1 == hit_way)):
            comb += use_forward2_next.eq(r1.forward_valid1)

        # The way that matched on a hit
        comb += req_hit_way.eq(hit_way)

        # The way to replace on a miss
        with m.If(r1.write_tag):
            comb += replace_way.eq(plru_victim)
        with m.Else():
            comb += replace_way.eq(r1.store_way)

        # work out whether we have permission for this access
        # NB we don't yet implement AMR, thus no KUAP
        comb += rc_ok.eq(perm_attr.reference
                         & (r0.req.load | perm_attr.changed))
        comb += perm_ok.eq((r0.req.priv_mode | (~perm_attr.priv)) &
                           (perm_attr.wr_perm |
                            (r0.req.load & perm_attr.rd_perm)))
        comb += access_ok.eq(valid_ra & perm_ok & rc_ok)

        # Combine the request and cache hit status to decide what
        # operation needs to be done
        comb += nc.eq(r0.req.nc | perm_attr.nocache)
        comb += op.eq(Op.OP_NONE)
        with m.If(go):
            with m.If(~access_ok):
                m.d.sync += Display("DCACHE access fail valid_ra=%d p=%d rc=%d",
                                    valid_ra, perm_ok, rc_ok)
                comb += op.eq(Op.OP_BAD)
            with m.Elif(cancel_store):
                m.d.sync += Display("DCACHE cancel store")
                comb += op.eq(Op.OP_STCX_FAIL)
            with m.Else():
                m.d.sync += Display("DCACHE valid_ra=%d nc=%d ld=%d",
                                    valid_ra, nc, r0.req.load)
                comb += opsel.eq(Cat(is_hit, nc, r0.req.load))
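                # Cat() packs LSB-first: opsel[0]=is_hit, opsel[1]=nc,
                # opsel[2]=load.  e.g. 0b101 below is a cacheable load
                # hit, 0b001 a store hit, and 0b011/0b111 are NC hits
                # (paradoxical, hence OP_BAD).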
                with m.Switch(opsel):
                    with m.Case(0b101): comb += op.eq(Op.OP_LOAD_HIT)
                    with m.Case(0b100): comb += op.eq(Op.OP_LOAD_MISS)
                    with m.Case(0b110): comb += op.eq(Op.OP_LOAD_NC)
                    with m.Case(0b001): comb += op.eq(Op.OP_STORE_HIT)
                    with m.Case(0b000): comb += op.eq(Op.OP_STORE_MISS)
                    with m.Case(0b010): comb += op.eq(Op.OP_STORE_MISS)
                    with m.Case(0b011): comb += op.eq(Op.OP_BAD)
                    with m.Case(0b111): comb += op.eq(Op.OP_BAD)
        comb += req_op.eq(op)
        comb += req_go.eq(go)

        # Version of the row number that is valid one cycle earlier
        # in the cases where we need to read the cache data BRAM.
        # If we're stalling then we need to keep reading the last
        # row requested.
        with m.If(~r0_stall):
            with m.If(m_in.valid):
                comb += early_req_row.eq(get_row(m_in.addr))
            with m.Else():
                comb += early_req_row.eq(get_row(d_in.addr))
        with m.Else():
            comb += early_req_row.eq(req_row)

    def reservation_comb(self, m, cancel_store, set_rsrv, clear_rsrv,
                         r0_valid, r0, reservation):
        """Handle load-with-reservation and store-conditional instructions
        """
        comb = m.d.comb

        with m.If(r0_valid & r0.req.reserve):
            # XXX generate alignment interrupt if address
            # is not aligned XXX or if r0.req.nc = '1'
            with m.If(r0.req.load):
                comb += set_rsrv.eq(r0.req.atomic_last) # load with reservation
            with m.Else():
                comb += clear_rsrv.eq(r0.req.atomic_last) # store conditional
                with m.If((~reservation.valid) |
                          (r0.req.addr[LINE_OFF_BITS:64] != reservation.addr)):
                    comb += cancel_store.eq(1)
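
        # note: the address comparison above is at cache-line granularity
        # (bits LINE_OFF_BITS and up), i.e. the store-conditional succeeds
        # if it falls anywhere within the reserved line.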

    def reservation_reg(self, m, r0_valid, access_ok, set_rsrv, clear_rsrv,
                        reservation, r0):
        comb = m.d.comb
        sync = m.d.sync

        with m.If(r0_valid & access_ok):
            with m.If(clear_rsrv):
                sync += reservation.valid.eq(0)
            with m.Elif(set_rsrv):
                sync += reservation.valid.eq(1)
                sync += reservation.addr.eq(r0.req.addr[LINE_OFF_BITS:64])

    def writeback_control(self, m, r1, cache_out_row):
        """Return data for loads & completion control logic
        """
        comb = m.d.comb
        sync = m.d.sync
        d_out, m_out = self.d_out, self.m_out

        data_out = Signal(64)
        data_fwd = Signal(64)

        # Use the bypass if we are reading the row that was
        # written 1 or 2 cycles ago, including for the
        # slow_valid = 1 case (i.e. completing a load
        # miss or a non-cacheable load).
        with m.If(r1.use_forward1):
            comb += data_fwd.eq(r1.forward_data1)
        with m.Else():
            comb += data_fwd.eq(r1.forward_data2)

        comb += data_out.eq(cache_out_row)

        for i in range(8):
            with m.If(r1.forward_sel[i]):
                dsel = data_fwd.word_select(i, 8)
                comb += data_out.word_select(i, 8).eq(dsel)

        # DCache output to LoadStore
        comb += d_out.valid.eq(r1.ls_valid)
        comb += d_out.data.eq(data_out)
        comb += d_out.store_done.eq(~r1.stcx_fail)
        comb += d_out.error.eq(r1.ls_error)
        comb += d_out.cache_paradox.eq(r1.cache_paradox)

        # Outputs to MMU
        comb += m_out.done.eq(r1.mmu_done)
        comb += m_out.err.eq(r1.mmu_error)
        comb += m_out.data.eq(data_out)

        # We have a valid load or store hit or we just completed
        # a slow op such as a load miss, a NC load or a store
        #
        # Note: the load hit is delayed by one cycle. However it
        # can still not collide with r.slow_valid (well unless I
        # miscalculated) because slow_valid can only be set on a
        # subsequent request and not on its first cycle (the state
        # machine must have advanced), which makes slow_valid
        # at least 2 cycles from the previous hit_load_valid.

        # Sanity: Only one of these must be set in any given cycle

        if False: # TODO: need Display to get this to work
            assert (r1.slow_valid & r1.stcx_fail) != 1, \
                "unexpected slow_valid collision with stcx_fail"

            assert ((r1.slow_valid | r1.stcx_fail) | r1.hit_load_valid) != 1, \
                "unexpected hit_load_delayed collision with slow_valid"

        with m.If(~r1.mmu_req):
            # Request came from loadstore1...
            # Load hit case is the standard path
            with m.If(r1.hit_load_valid):
                sync += Display("completing load hit data=%x", data_out)

            # error cases complete without stalling
            with m.If(r1.ls_error):
                with m.If(r1.dcbz):
                    sync += Display("completing dcbz with error")
                with m.Else():
                    sync += Display("completing ld/st with error")

            # Slow ops (load miss, NC, stores)
            with m.If(r1.slow_valid):
                sync += Display("completing store or load miss adr=%x data=%x",
                                r1.req.real_addr, data_out)

        with m.Else():
            # Request came from MMU
            with m.If(r1.hit_load_valid):
                sync += Display("completing load hit to MMU, data=%x",
                                m_out.data)
            # error cases complete without stalling
            with m.If(r1.mmu_error):
                sync += Display("completing MMU ld with error")

            # Slow ops (i.e. load miss)
            with m.If(r1.slow_valid):
                sync += Display("completing MMU load miss, adr=%x data=%x",
                                r1.req.real_addr, m_out.data)

    def rams(self, m, r1, early_req_row, cache_out_row, replace_way):
        """rams
        Generate a cache RAM for each way. This handles the normal
        reads, writes from reloads and the special store-hit update
        path as well.

        Note: the BRAMs have an extra read buffer, meaning the output
        is pipelined an extra cycle. This differs from the
        icache. The writeback logic needs to take that into
        account by using 1-cycle delayed signals for load hits.
        """
        comb = m.d.comb
        bus = self.bus

        # Binary-to-Unary one-hot decoders here. the replace-way one-hot
        # is gated (enabled) by bus.ack, not-write-bram, and state
        # RELOAD_WAIT_ACK
        m.submodules.rams_replace_way_e = rwe = Decoder(NUM_WAYS)
        comb += rwe.n.eq(~((r1.state == State.RELOAD_WAIT_ACK) & bus.ack &
                           ~r1.write_bram))
        comb += rwe.i.eq(replace_way)

        m.submodules.rams_hit_way_e = hwe = Decoder(NUM_WAYS)
        comb += hwe.i.eq(r1.hit_way)

        # this one is gated with write_bram, and replace_way_e can never be
        # set at the same time.  that means that do_write can OR the outputs
        m.submodules.rams_hit_req_way_e = hre = Decoder(NUM_WAYS)
        comb += hre.n.eq(~r1.write_bram) # Decoder.n is inverted
        comb += hre.i.eq(r1.req.hit_way)

        # common Signals
        do_read = Signal()
        wr_addr = Signal(ROW_BITS)
        wr_data = Signal(WB_DATA_BITS)
        wr_sel  = Signal(ROW_SIZE)
        rd_addr = Signal(ROW_BITS)

        comb += do_read.eq(1) # always enable
        comb += rd_addr.eq(early_req_row)

        # Write mux:
        #
        # Defaults to wishbone read responses (cache refill)
        #
        # For timing, the mux on wr_data/sel/addr is not
        # dependent on anything other than the current state.

        with m.If(r1.write_bram):
            # Write store data to BRAM.  This happens one
            # cycle after the store is in r0.
            comb += wr_data.eq(r1.req.data)
            comb += wr_sel.eq(r1.req.byte_sel)
            comb += wr_addr.eq(get_row(r1.req.real_addr))

        with m.Else():
            # Otherwise, we might be doing a reload or a DCBZ
            with m.If(r1.dcbz):
                comb += wr_data.eq(0)
            with m.Else():
                comb += wr_data.eq(bus.dat_r)
            comb += wr_addr.eq(r1.store_row)
            comb += wr_sel.eq(~0) # all 1s

        # set up Cache Rams
        for i in range(NUM_WAYS):
            do_write = Signal(name="do_wr%d" % i)
            wr_sel_m = Signal(ROW_SIZE, name="wr_sel_m_%d" % i)
            d_out    = Signal(WB_DATA_BITS, name="dout_%d" % i) # cache_row_t

            way = CacheRam(ROW_BITS, WB_DATA_BITS, ADD_BUF=True, ram_num=i)
            setattr(m.submodules, "cacheram_%d" % i, way)

            comb += way.rd_en.eq(do_read)
            comb += way.rd_addr.eq(rd_addr)
            comb += d_out.eq(way.rd_data_o)
            comb += way.wr_sel.eq(wr_sel_m)
            comb += way.wr_addr.eq(wr_addr)
            comb += way.wr_data.eq(wr_data)

            # Cache hit reads
            with m.If(hwe.o[i]):
                comb += cache_out_row.eq(d_out)

            # these are mutually-exclusive via their Decoder-enablers
            # (note: Decoder-enable is inverted)
            comb += do_write.eq(hre.o[i] | rwe.o[i])

            # Mask write selects with do_write since BRAM
            # doesn't have a global write-enable
            with m.If(do_write):
                comb += wr_sel_m.eq(wr_sel)

    # Cache hit synchronous machine for the easy case.
    # This handles load hits.
    # It also handles error cases (TLB miss, cache paradox)
    def dcache_fast_hit(self, m, req_op, r0_valid, r0, r1,
                        req_hit_way, req_index, req_tag, access_ok,
                        tlb_hit, tlb_req_index):
        comb = m.d.comb
        sync = m.d.sync

        with m.If(req_op != Op.OP_NONE):
            sync += Display("op:%d addr:%x nc: %d idx: %x tag: %x way: %x",
                            req_op, r0.req.addr, r0.req.nc,
                            req_index, req_tag, req_hit_way)

        with m.If(r0_valid):
            sync += r1.mmu_req.eq(r0.mmu_req)

        # Fast path for load/store hits.
        # Set signals for the writeback controls.
        sync += r1.hit_way.eq(req_hit_way)
        sync += r1.hit_index.eq(req_index)

        sync += r1.hit_load_valid.eq(req_op == Op.OP_LOAD_HIT)
        sync += r1.cache_hit.eq((req_op == Op.OP_LOAD_HIT) |
                                (req_op == Op.OP_STORE_HIT))

        with m.If(req_op == Op.OP_BAD):
            sync += Display("Signalling ld/st error "
                            "ls_error=%i mmu_error=%i cache_paradox=%i",
                            ~r0.mmu_req, r0.mmu_req, access_ok)
            sync += r1.ls_error.eq(~r0.mmu_req)
            sync += r1.mmu_error.eq(r0.mmu_req)
            sync += r1.cache_paradox.eq(access_ok)
        with m.Else():
            sync += r1.ls_error.eq(0)
            sync += r1.mmu_error.eq(0)
            sync += r1.cache_paradox.eq(0)

        sync += r1.stcx_fail.eq(req_op == Op.OP_STCX_FAIL)

        # Record TLB hit information for updating TLB PLRU
        sync += r1.tlb_hit.eq(tlb_hit)
        sync += r1.tlb_hit_index.eq(tlb_req_index)

    # Memory accesses are handled by this state machine:
    #
    #   * Cache load miss/reload (in conjunction with "rams")
    #   * Load hits for non-cachable forms
    #   * Stores (the collision case is handled in "rams")
    #
    # All wishbone requests generation is done here.
    # This machine operates at stage 1.
    def dcache_slow(self, m, r1, use_forward1_next, use_forward2_next,
                    r0, replace_way,
                    req_hit_way, req_same_tag,
                    r0_valid, req_op, cache_tags, req_go, ra):

        comb = m.d.comb
        sync = m.d.sync
        bus = self.bus
        d_in = self.d_in

        req = MemAccessRequest("mreq_ds")

        req_row = Signal(ROW_BITS)
        req_idx = Signal(INDEX_BITS)
        req_tag = Signal(TAG_BITS)
        comb += req_idx.eq(get_index(req.real_addr))
        comb += req_row.eq(get_row(req.real_addr))
        comb += req_tag.eq(get_tag(req.real_addr))

        sync += r1.use_forward1.eq(use_forward1_next)
        sync += r1.forward_sel.eq(0)

        with m.If(use_forward1_next):
            sync += r1.forward_sel.eq(r1.req.byte_sel)
        with m.Elif(use_forward2_next):
            sync += r1.forward_sel.eq(r1.forward_sel1)

        sync += r1.forward_data2.eq(r1.forward_data1)
        with m.If(r1.write_bram):
            sync += r1.forward_data1.eq(r1.req.data)
            sync += r1.forward_sel1.eq(r1.req.byte_sel)
            sync += r1.forward_way1.eq(r1.req.hit_way)
            sync += r1.forward_row1.eq(get_row(r1.req.real_addr))
            sync += r1.forward_valid1.eq(1)
        with m.Else():
            with m.If(r1.dcbz):
                sync += r1.forward_data1.eq(0)
            with m.Else():
                sync += r1.forward_data1.eq(bus.dat_r)
            sync += r1.forward_sel1.eq(~0) # all 1s
            sync += r1.forward_way1.eq(replace_way)
            sync += r1.forward_row1.eq(r1.store_row)
            sync += r1.forward_valid1.eq(0)

        # One cycle pulses reset
        sync += r1.slow_valid.eq(0)
        sync += r1.write_bram.eq(0)
        sync += r1.inc_acks.eq(0)
        sync += r1.dec_acks.eq(0)

        sync += r1.ls_valid.eq(0)
        # complete tlbies and TLB loads in the third cycle
        sync += r1.mmu_done.eq(r0_valid & (r0.tlbie | r0.tlbld))

        with m.If((req_op == Op.OP_LOAD_HIT) | (req_op == Op.OP_STCX_FAIL)):
            with m.If(~r0.mmu_req):
                sync += r1.ls_valid.eq(1)
            with m.Else():
                sync += r1.mmu_done.eq(1)

        with m.If(r1.write_tag):
            # Store new tag in selected way
            replace_way_onehot = Signal(NUM_WAYS)
            comb += replace_way_onehot.eq(1<<replace_way)
            for i in range(NUM_WAYS):
                with m.If(replace_way_onehot[i]):
                    ct = Signal(TAG_RAM_WIDTH)
                    comb += ct.eq(cache_tags[r1.store_index].tag)
                    comb += ct.word_select(i, TAG_WIDTH).eq(r1.reload_tag)
                    sync += cache_tags[r1.store_index].tag.eq(ct)
            sync += r1.store_way.eq(replace_way)
            sync += r1.write_tag.eq(0)

        # Take request from r1.req if there is one there,
        # else from req_op, ra, etc.
        with m.If(r1.full):
            comb += req.eq(r1.req)
        with m.Else():
            comb += req.op.eq(req_op)
            comb += req.valid.eq(req_go)
            comb += req.mmu_req.eq(r0.mmu_req)
            comb += req.dcbz.eq(r0.req.dcbz)
            comb += req.real_addr.eq(ra)

            with m.If(r0.req.dcbz):
                # force data to 0 for dcbz
                comb += req.data.eq(0)
            with m.Elif(r0.d_valid):
                comb += req.data.eq(r0.req.data)
            with m.Else():
                comb += req.data.eq(d_in.data)

            # Select all bytes for dcbz
            # and for cacheable loads
            with m.If(r0.req.dcbz | (r0.req.load & ~r0.req.nc)):
                comb += req.byte_sel.eq(~0) # all 1s
            with m.Else():
                comb += req.byte_sel.eq(r0.req.byte_sel)
            comb += req.hit_way.eq(req_hit_way)
            comb += req.same_tag.eq(req_same_tag)

        # Store the incoming request from r0,
        # if it is a slow request
        # Note that r1.full = 1 implies req_op = OP_NONE
        with m.If((req_op == Op.OP_LOAD_MISS)
                  | (req_op == Op.OP_LOAD_NC)
                  | (req_op == Op.OP_STORE_MISS)
                  | (req_op == Op.OP_STORE_HIT)):
            sync += r1.req.eq(req)
            sync += r1.full.eq(1)

        # Main state machine
        with m.Switch(r1.state):

            with m.Case(State.IDLE):
                sync += r1.wb.adr.eq(req.real_addr[ROW_LINE_BITS:])
                sync += r1.wb.sel.eq(req.byte_sel)
                sync += r1.wb.dat.eq(req.data)
                sync += r1.dcbz.eq(req.dcbz)

                # Keep track of our index and way
                # for subsequent stores.
                sync += r1.store_index.eq(req_idx)
                sync += r1.store_row.eq(req_row)
                sync += r1.end_row_ix.eq(get_row_of_line(req_row)-1)
                sync += r1.reload_tag.eq(req_tag)
                sync += r1.req.same_tag.eq(1)

                with m.If(req.op == Op.OP_STORE_HIT):
                    sync += r1.store_way.eq(req.hit_way)

                # Reset per-row valid bits,
                # ready for handling OP_LOAD_MISS
                for i in range(ROW_PER_LINE):
                    sync += r1.rows_valid[i].eq(0)

                with m.If(req_op != Op.OP_NONE):
                    sync += Display("cache op %d", req.op)

                with m.Switch(req.op):
                    with m.Case(Op.OP_LOAD_HIT):
                        # stay in IDLE state
                        pass

                    with m.Case(Op.OP_LOAD_MISS):
                        sync += Display("cache miss real addr: %x " \
                                        "idx: %x tag: %x",
                                        req.real_addr, req_row, req_tag)

                        # Start the wishbone cycle
                        sync += r1.wb.we.eq(0)
                        sync += r1.wb.cyc.eq(1)
                        sync += r1.wb.stb.eq(1)

                        # Track that we had one request sent
                        sync += r1.state.eq(State.RELOAD_WAIT_ACK)
                        sync += r1.write_tag.eq(1)

                    with m.Case(Op.OP_LOAD_NC):
                        sync += r1.wb.cyc.eq(1)
                        sync += r1.wb.stb.eq(1)
                        sync += r1.wb.we.eq(0)
                        sync += r1.state.eq(State.NC_LOAD_WAIT_ACK)

                    with m.Case(Op.OP_STORE_HIT, Op.OP_STORE_MISS):
                        with m.If(~req.dcbz):
                            sync += r1.state.eq(State.STORE_WAIT_ACK)
                            sync += r1.acks_pending.eq(1)
                            sync += r1.full.eq(0)
                            sync += r1.slow_valid.eq(1)

                            with m.If(~req.mmu_req):
                                sync += r1.ls_valid.eq(1)
                            with m.Else():
                                sync += r1.mmu_done.eq(1)

                            with m.If(req.op == Op.OP_STORE_HIT):
                                sync += r1.write_bram.eq(1)
                        with m.Else():
                            # dcbz is handled much like a load miss except
                            # that we are writing to memory instead of reading
                            sync += r1.state.eq(State.RELOAD_WAIT_ACK)

                            with m.If(req.op == Op.OP_STORE_MISS):
                                sync += r1.write_tag.eq(1)

                        sync += r1.wb.we.eq(1)
                        sync += r1.wb.cyc.eq(1)
                        sync += r1.wb.stb.eq(1)

                    # OP_NONE and OP_BAD do nothing
                    # OP_BAD & OP_STCX_FAIL were
                    # handled above already
                    with m.Case(Op.OP_NONE):
                        pass
                    with m.Case(Op.OP_BAD):
                        pass
                    with m.Case(Op.OP_STCX_FAIL):
                        pass

            with m.Case(State.RELOAD_WAIT_ACK):
                ld_stbs_done = Signal()
                # Requests are all sent if stb is 0
                comb += ld_stbs_done.eq(~r1.wb.stb)

                # If we are still sending requests, was one accepted?
                with m.If((~bus.stall) & r1.wb.stb):
                    # That was the last word?  We are done sending.
                    # Clear stb and set ld_stbs_done so we can handle an
                    # eventual last ack on the same cycle.
                    # sigh - reconstruct wb adr with 3 extra 0s at front
                    wb_adr = Cat(Const(0, ROW_OFF_BITS), r1.wb.adr)
                    with m.If(is_last_row_addr(wb_adr, r1.end_row_ix)):
                        sync += r1.wb.stb.eq(0)
                        comb += ld_stbs_done.eq(1)

                    # Calculate the next row address in the current cache line
                    row = Signal(LINE_OFF_BITS-ROW_OFF_BITS)
                    comb += row.eq(r1.wb.adr)
                    sync += r1.wb.adr[:LINE_OFF_BITS-ROW_OFF_BITS].eq(row+1)

                # Incoming acks processing
                sync += r1.forward_valid1.eq(bus.ack)
                with m.If(bus.ack):
                    srow = Signal(ROW_LINE_BITS)
                    comb += srow.eq(r1.store_row)
                    sync += r1.rows_valid[srow].eq(1)

                    # If this is the data we were looking for,
                    # we can complete the request next cycle.
                    # Compare the whole address in case the
                    # request in r1.req is not the one that
                    # started this refill.
                    with m.If(req.valid & r1.req.same_tag &
                              ((r1.dcbz & r1.req.dcbz) |
                               (~r1.dcbz & (r1.req.op == Op.OP_LOAD_MISS))) &
                              (r1.store_row == get_row(req.real_addr))):
                        sync += r1.full.eq(0)
                        sync += r1.slow_valid.eq(1)
                        with m.If(~r1.mmu_req):
                            sync += r1.ls_valid.eq(1)
                        with m.Else():
                            sync += r1.mmu_done.eq(1)
                        sync += r1.forward_sel.eq(~0) # all 1s
                        sync += r1.use_forward1.eq(1)

                    # Check for completion
                    with m.If(ld_stbs_done & is_last_row(r1.store_row,
                                                         r1.end_row_ix)):
                        # Complete wishbone cycle
                        sync += r1.wb.cyc.eq(0)

                        # Cache line is now valid
                        cv = Signal(INDEX_BITS)
                        comb += cv.eq(cache_tags[r1.store_index].valid)
                        comb += cv.bit_select(r1.store_way, 1).eq(1)
                        sync += cache_tags[r1.store_index].valid.eq(cv)

                        sync += r1.state.eq(State.IDLE)
                        sync += Display("cache valid set %x "
                                        "idx %d way %d",
                                        cv, r1.store_index, r1.store_way)

                    # Increment store row counter
                    sync += r1.store_row.eq(next_row(r1.store_row))

            with m.Case(State.STORE_WAIT_ACK):
                st_stbs_done = Signal()
                acks        = Signal(3)
                adjust_acks = Signal(3)

                comb += st_stbs_done.eq(~r1.wb.stb)
                comb += acks.eq(r1.acks_pending)

                with m.If(r1.inc_acks != r1.dec_acks):
                    with m.If(r1.inc_acks):
                        comb += adjust_acks.eq(acks + 1)
                    with m.Else():
                        comb += adjust_acks.eq(acks - 1)
                with m.Else():
                    comb += adjust_acks.eq(acks)

                sync += r1.acks_pending.eq(adjust_acks)

                # Clear stb when slave accepted request
                with m.If(~bus.stall):
                    # See if there is another store waiting
                    # to be done which is in the same real page.
                    with m.If(req.valid):
                        _ra = req.real_addr[ROW_LINE_BITS:SET_SIZE_BITS]
                        sync += r1.wb.adr[0:SET_SIZE_BITS].eq(_ra)
                        sync += r1.wb.dat.eq(req.data)
                        sync += r1.wb.sel.eq(req.byte_sel)

                    with m.If((adjust_acks < 7) & req.same_tag &
                              ((req.op == Op.OP_STORE_MISS)
                               | (req.op == Op.OP_STORE_HIT))):
                        sync += r1.wb.stb.eq(1)
                        comb += st_stbs_done.eq(0)

                        with m.If(req.op == Op.OP_STORE_HIT):
                            sync += r1.write_bram.eq(1)
                        sync += r1.full.eq(0)
                        sync += r1.slow_valid.eq(1)

                        # Store requests never come from the MMU
                        sync += r1.ls_valid.eq(1)
                        comb += st_stbs_done.eq(0)
                        sync += r1.inc_acks.eq(1)
                    with m.Else():
                        sync += r1.wb.stb.eq(0)
                        comb += st_stbs_done.eq(1)

                # Got ack ? See if complete.
                with m.If(bus.ack):
                    with m.If(st_stbs_done & (adjust_acks == 1)):
                        sync += r1.state.eq(State.IDLE)
                        sync += r1.wb.cyc.eq(0)
                        sync += r1.wb.stb.eq(0)
                    sync += r1.dec_acks.eq(1)

            with m.Case(State.NC_LOAD_WAIT_ACK):
                # Clear stb when slave accepted request
                with m.If(~bus.stall):
                    sync += r1.wb.stb.eq(0)

                # Got ack ? complete.
                with m.If(bus.ack):
                    sync += r1.state.eq(State.IDLE)
                    sync += r1.full.eq(0)
                    sync += r1.slow_valid.eq(1)

                    with m.If(~r1.mmu_req):
                        sync += r1.ls_valid.eq(1)
                    with m.Else():
                        sync += r1.mmu_done.eq(1)

                    sync += r1.forward_sel.eq(~0) # all 1s
                    sync += r1.use_forward1.eq(1)
                    sync += r1.wb.cyc.eq(0)
                    sync += r1.wb.stb.eq(0)

    def dcache_log(self, m, r1, valid_ra, tlb_hit, stall_out, req_op):

        sync = m.d.sync
        d_out, bus, log_out = self.d_out, self.bus, self.log_out

        sync += log_out.eq(Cat(r1.state[:3], valid_ra, tlb_hit.way[:3],
                               stall_out, req_op[:3], d_out.valid, d_out.error,
                               r1.wb.cyc, r1.wb.stb, bus.ack, bus.stall,
                               r1.req.real_addr[3:6]))

    def elaborate(self, platform):

        m = Module()
        comb = m.d.comb
        d_in = self.d_in

        # Storage. Hopefully "cache_rows" is a BRAM, the rest is LUTs
        cache_tags    = CacheTagArray()
        cache_tag_set = Signal(TAG_RAM_WIDTH)

        # TODO attribute ram_style : string;
        # TODO attribute ram_style of cache_tags : signal is "distributed";

        """note: these are passed to nmigen.hdl.Memory as "attributes".
           don't know how, just that they are.
        """
        # TODO attribute ram_style of
        #  dtlb_tags : signal is "distributed";
        # TODO attribute ram_style of
        #  dtlb_ptes : signal is "distributed";

        r0      = RegStage0("r0")
        r0_full = Signal()

        r1 = RegStage1("r1")

        reservation = Reservation()

        # Async signals on incoming request
        req_index    = Signal(INDEX_BITS)
        req_row      = Signal(ROW_BITS)
        req_hit_way  = Signal(WAY_BITS)
        req_tag      = Signal(TAG_BITS)
        req_op       = Signal(Op)
        req_data     = Signal(64)
        req_same_tag = Signal()
        req_go       = Signal()

        early_req_row = Signal(ROW_BITS)

        cancel_store = Signal()
        set_rsrv     = Signal()
        clear_rsrv   = Signal()

        r0_valid = Signal()
        r0_stall = Signal()

        use_forward1_next = Signal()
        use_forward2_next = Signal()

        cache_out_row = Signal(WB_DATA_BITS)

        plru_victim = Signal(WAY_BITS)
        replace_way = Signal(WAY_BITS)

        # Wishbone read/write/cache write formatting signals
        bus_sel = Signal(8)

        # TLB signals
        tlb_way = TLBRecord("tlb_way")
        tlb_req_index = Signal(TLB_SET_BITS)
        tlb_hit = TLBHit("tlb_hit")
        pte = Signal(TLB_PTE_BITS)
        ra = Signal(REAL_ADDR_BITS)
        valid_ra = Signal()
        perm_attr = PermAttr("dc_perms")
        rc_ok = Signal()
        perm_ok = Signal()
        access_ok = Signal()

        tlb_plru_victim = Signal(TLB_WAY_BITS)

        # we don't yet handle collisions between loadstore1 requests
        # and MMU requests
        comb += self.m_out.stall.eq(0)

        # Hold off the request in r0 when r1 has an uncompleted request
        comb += r0_stall.eq(r0_full & (r1.full | d_in.hold))
        comb += r0_valid.eq(r0_full & ~r1.full & ~d_in.hold)
        comb += self.stall_out.eq(r0_stall)

        # deal with litex not doing wishbone pipeline mode
        # XXX in wrong way.  FIFOs are needed in the SRAM test
        # so that stb/ack match up. same thing done in icache.py
        comb += self.bus.stall.eq(self.bus.cyc & ~self.bus.ack)

        # Wire up wishbone request latch out of stage 1
        comb += self.bus.we.eq(r1.wb.we)
        comb += self.bus.adr.eq(r1.wb.adr)
        comb += self.bus.sel.eq(r1.wb.sel)
        comb += self.bus.stb.eq(r1.wb.stb)
        comb += self.bus.dat_w.eq(r1.wb.dat)
        comb += self.bus.cyc.eq(r1.wb.cyc)

        # create submodule TLBUpdate
        m.submodules.dtlb_update = self.dtlb_update = DTLBUpdate()
        dtlb = self.dtlb_update.dtlb

        # call sub-functions putting everything together,
        # using shared signals established above
        self.stage_0(m, r0, r1, r0_full)
        self.tlb_read(m, r0_stall, tlb_way, dtlb)
        self.tlb_search(m, tlb_req_index, r0, r0_valid,
                        tlb_way,
                        pte, tlb_hit, valid_ra, perm_attr, ra)
        self.tlb_update(m, r0_valid, r0, dtlb, tlb_req_index,
                        tlb_hit, tlb_plru_victim,
                        tlb_way)
        self.maybe_plrus(m, r1, plru_victim)
        self.maybe_tlb_plrus(m, r1, tlb_plru_victim, tlb_req_index)
        self.cache_tag_read(m, r0_stall, req_index, cache_tag_set, cache_tags)
        self.dcache_request(m, r0, ra, req_index, req_row, req_tag,
                            r0_valid, r1, cache_tags, replace_way,
                            use_forward1_next, use_forward2_next,
                            req_hit_way, plru_victim, rc_ok, perm_attr,
                            valid_ra, perm_ok, access_ok, req_op, req_go,
                            tlb_hit, tlb_way, cache_tag_set,
                            cancel_store, req_same_tag, r0_stall,
                            early_req_row)
        self.reservation_comb(m, cancel_store, set_rsrv, clear_rsrv,
                              r0_valid, r0, reservation)
        self.reservation_reg(m, r0_valid, access_ok, set_rsrv, clear_rsrv,
                             reservation, r0)
        self.writeback_control(m, r1, cache_out_row)
        self.rams(m, r1, early_req_row, cache_out_row, replace_way)
        self.dcache_fast_hit(m, req_op, r0_valid, r0, r1,
                             req_hit_way, req_index, req_tag, access_ok,
                             tlb_hit, tlb_req_index)
        self.dcache_slow(m, r1, use_forward1_next, use_forward2_next,
                         r0, replace_way,
                         req_hit_way, req_same_tag,
                         r0_valid, req_op, cache_tags, req_go, ra)
        #self.dcache_log(m, r1, valid_ra, tlb_hit, self.stall_out, req_op)

        return m


if __name__ == '__main__':
    dut = DCache()
    vl = rtlil.convert(dut, ports=[])
    with open("test_dcache.il", "w") as f:
        f.write(vl)
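
# running this module directly emits test_dcache.il, which can then be
# loaded into yosys for inspection (e.g. with read_rtlil, or read_ilang
# on older yosys versions).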