1 """DCache
2
3 based on Anton Blanchard microwatt dcache.vhdl
4
5 note that the microwatt dcache wishbone interface expects "stall".
6 for simplicity at the moment this is hard-coded to cyc & ~ack.
7 see WB4 spec, p84, section 5.2.1
8
9 IMPORTANT: for store, the data is sampled the cycle AFTER the "valid"
10 is raised. sigh
11
12 Links:
13
14 * https://libre-soc.org/3d_gpu/architecture/set_associative_cache.jpg
15 * https://bugs.libre-soc.org/show_bug.cgi?id=469
16
17 """

import sys

from nmutil.gtkw import write_gtkw

sys.setrecursionlimit(1000000)

from enum import Enum, unique

from nmigen import (Module, Signal, Elaboratable, Cat, Repl, Array, Const,
                    Record)
from nmutil.util import Display
from nmigen.lib.coding import Decoder

from copy import deepcopy
from random import randint, seed

from nmigen_soc.wishbone.bus import Interface

from nmigen.cli import main
from nmutil.iocontrol import RecordObject
from nmigen.utils import log2_int
from soc.experiment.mem_types import (LoadStore1ToDCacheType,
                                      DCacheToLoadStore1Type,
                                      MMUToDCacheType,
                                      DCacheToMMUType)

from soc.experiment.wb_types import (WB_ADDR_BITS, WB_DATA_BITS, WB_SEL_BITS,
                                     WBAddrType, WBDataType, WBSelType,
                                     WBMasterOut, WBSlaveOut,
                                     WBMasterOutVector, WBSlaveOutVector,
                                     WBIOMasterOut, WBIOSlaveOut)

from soc.experiment.cache_ram import CacheRam
#from soc.experiment.plru import PLRU
from nmutil.plru import PLRU, PLRUs

# for test
from soc.bus.sram import SRAM
from nmigen import Memory
from nmigen.cli import rtlil

# NOTE: to use cxxsim, export NMIGEN_SIM_MODE=cxxsim from the shell
# Also, check out the cxxsim nmigen branch, and latest yosys from git
from nmutil.sim_tmp_alternative import Simulator

from nmutil.util import wrap


# TODO: make these parameters of DCache at some point
LINE_SIZE = 64    # Line size in bytes
NUM_LINES = 16    # Number of lines in a set
NUM_WAYS = 4      # Number of ways
TLB_SET_SIZE = 64 # L1 DTLB entries per set
TLB_NUM_WAYS = 2  # L1 DTLB number of ways
TLB_LG_PGSZ = 12  # L1 DTLB log_2(page_size)
LOG_LENGTH = 0    # Non-zero to enable log data collection

# BRAM organisation: We never access more than WB_DATA_BITS
# at a time so to save resources we make the array only that
# wide, and use consecutive indices to make a cache "line"
#
# ROW_SIZE is the width in bytes of the BRAM (based on WB, so 64-bits)
ROW_SIZE = WB_DATA_BITS // 8

# ROW_PER_LINE is the number of rows (wishbone
# transactions) in a line
ROW_PER_LINE = LINE_SIZE // ROW_SIZE

# BRAM_ROWS is the number of rows in BRAM needed
# to represent the full dcache
BRAM_ROWS = NUM_LINES * ROW_PER_LINE

print ("ROW_SIZE", ROW_SIZE)
print ("ROW_PER_LINE", ROW_PER_LINE)
print ("BRAM_ROWS", BRAM_ROWS)
print ("NUM_WAYS", NUM_WAYS)

# Bit fields counts in the address

# REAL_ADDR_BITS is the number of real address
# bits that we store
REAL_ADDR_BITS = 56

# ROW_BITS is the number of bits to select a row
ROW_BITS = log2_int(BRAM_ROWS)

# ROW_LINE_BITS is the number of bits to select
# a row within a line
ROW_LINE_BITS = log2_int(ROW_PER_LINE)

# LINE_OFF_BITS is the number of bits for
# the offset in a cache line
LINE_OFF_BITS = log2_int(LINE_SIZE)

# ROW_OFF_BITS is the number of bits for
# the offset in a row
ROW_OFF_BITS = log2_int(ROW_SIZE)

# INDEX_BITS is the number of bits to
# select a cache line
INDEX_BITS = log2_int(NUM_LINES)

# SET_SIZE_BITS is the log base 2 of the set size
SET_SIZE_BITS = LINE_OFF_BITS + INDEX_BITS

# TAG_BITS is the number of bits of
# the tag part of the address
TAG_BITS = REAL_ADDR_BITS - SET_SIZE_BITS

# TAG_WIDTH is the width in bits of each way of the tag RAM
TAG_WIDTH = TAG_BITS + 7 - ((TAG_BITS + 7) % 8)

# WAY_BITS is the number of bits to select a way
WAY_BITS = log2_int(NUM_WAYS)

# Example of layout for 32 lines of 64 bytes:
layout = """\
  ..  tag    |index| line  |
  ..         |   row   |   |
  ..         |     |---|   | ROW_LINE_BITS  (3)
  ..         |     |--- - --| LINE_OFF_BITS (6)
  ..         |         |- --| ROW_OFF_BITS  (3)
  ..         |----- ---|   | ROW_BITS       (8)
  ..         |-----|       | INDEX_BITS     (5)
  ..  --------|            | TAG_BITS       (45)
"""
print (layout)
print ("Dcache TAG %d IDX %d ROW_BITS %d ROFF %d LOFF %d RLB %d" % \
            (TAG_BITS, INDEX_BITS, ROW_BITS,
             ROW_OFF_BITS, LINE_OFF_BITS, ROW_LINE_BITS))
print ("index @: %d-%d" % (LINE_OFF_BITS, SET_SIZE_BITS))
print ("row @: %d-%d" % (ROW_OFF_BITS, SET_SIZE_BITS))
print ("tag @: %d-%d width %d" % (SET_SIZE_BITS, REAL_ADDR_BITS, TAG_WIDTH))

TAG_RAM_WIDTH = TAG_WIDTH * NUM_WAYS

print ("TAG_RAM_WIDTH", TAG_RAM_WIDTH)
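
# Worked example with the default settings above (these match the
# values printed at import time):
#   ROW_SIZE      = 8    (64-bit wishbone, so 8 bytes per row)
#   ROW_PER_LINE  = 8    (64-byte line / 8-byte row)
#   BRAM_ROWS     = 128  (16 lines * 8 rows)
#   ROW_BITS      = 7    ROW_LINE_BITS = 3
#   LINE_OFF_BITS = 6    ROW_OFF_BITS  = 3
#   INDEX_BITS    = 4    SET_SIZE_BITS = 10
#   TAG_BITS      = 46   (56 - 10)
#   TAG_WIDTH     = 48   (46 rounded up to a multiple of 8)
#   WAY_BITS      = 2    TAG_RAM_WIDTH = 192 (48 * 4 ways)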

def CacheTagArray():
    tag_layout = [('valid', 1),
                  ('tag', TAG_RAM_WIDTH),
                 ]
    return Array(Record(tag_layout, name="tag%d" % x) for x in range(NUM_LINES))

def RowPerLineValidArray():
    return Array(Signal(name="rows_valid%d" % x) \
                        for x in range(ROW_PER_LINE))

# L1 TLB
TLB_SET_BITS = log2_int(TLB_SET_SIZE)
TLB_WAY_BITS = log2_int(TLB_NUM_WAYS)
TLB_EA_TAG_BITS = 64 - (TLB_LG_PGSZ + TLB_SET_BITS)
TLB_TAG_WAY_BITS = TLB_NUM_WAYS * TLB_EA_TAG_BITS
TLB_PTE_BITS = 64
TLB_PTE_WAY_BITS = TLB_NUM_WAYS * TLB_PTE_BITS
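
# With the defaults (64 sets, 2 ways, 4k pages) this works out as:
#   TLB_SET_BITS     = 6    TLB_WAY_BITS     = 1
#   TLB_EA_TAG_BITS  = 46   (64 - 12 - 6)
#   TLB_TAG_WAY_BITS = 92   TLB_PTE_WAY_BITS = 128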

def ispow2(x):
    return (1<<log2_int(x, False)) == x

assert (LINE_SIZE % ROW_SIZE) == 0, "LINE_SIZE not multiple of ROW_SIZE"
assert ispow2(LINE_SIZE), "LINE_SIZE not power of 2"
assert ispow2(NUM_LINES), "NUM_LINES not power of 2"
assert ispow2(ROW_PER_LINE), "ROW_PER_LINE not power of 2"
assert ROW_BITS == (INDEX_BITS + ROW_LINE_BITS), "geometry bits don't add up"
assert (LINE_OFF_BITS == ROW_OFF_BITS + ROW_LINE_BITS), \
        "geometry bits don't add up"
assert REAL_ADDR_BITS == (TAG_BITS + INDEX_BITS + LINE_OFF_BITS), \
        "geometry bits don't add up"
assert REAL_ADDR_BITS == (TAG_BITS + ROW_BITS + ROW_OFF_BITS), \
        "geometry bits don't add up"
assert 64 == WB_DATA_BITS, "Can't yet handle wb width that isn't 64-bits"
assert SET_SIZE_BITS <= TLB_LG_PGSZ, "Set indexed by virtual address"

def TLBHit(name):
    return Record([('valid', 1),
                   ('way', TLB_WAY_BITS)], name=name)

def TLBTagEAArray():
    return Array(Signal(TLB_EA_TAG_BITS, name="tlbtagea%d" % x) \
                for x in range (TLB_NUM_WAYS))

def TLBRecord(name):
    tlb_layout = [('valid', TLB_NUM_WAYS),
                  ('tag', TLB_TAG_WAY_BITS),
                  ('pte', TLB_PTE_WAY_BITS)
                 ]
    return Record(tlb_layout, name=name)
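
# with the default geometry each per-set TLB Record is therefore
# valid (2 bits), tag (92 bits), pte (128 bits): one valid bit,
# one 46-bit EA tag and one 64-bit PTE per way.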

def TLBArray():
    return Array(TLBRecord(name="tlb%d" % x) for x in range(TLB_SET_SIZE))

def HitWaySet():
    return Array(Signal(WAY_BITS, name="hitway_%d" % x) \
                        for x in range(TLB_NUM_WAYS))

# Cache RAM interface
def CacheRamOut():
    return Array(Signal(WB_DATA_BITS, name="cache_out%d" % x) \
                 for x in range(NUM_WAYS))

# PLRU output interface
def PLRUOut():
    return Array(Signal(WAY_BITS, name="plru_out%d" % x) \
                for x in range(NUM_LINES))

# TLB PLRU output interface
def TLBPLRUOut():
    return Array(Signal(TLB_WAY_BITS, name="tlbplru_out%d" % x) \
                for x in range(TLB_SET_SIZE))

# Helper functions to decode incoming requests
#
# Return the cache line index (tag index) for an address
def get_index(addr):
    return addr[LINE_OFF_BITS:SET_SIZE_BITS]

# Return the cache row index (data memory) for an address
def get_row(addr):
    return addr[ROW_OFF_BITS:SET_SIZE_BITS]

# Return the index of a row within a line
def get_row_of_line(row):
    return row[:ROW_BITS][:ROW_LINE_BITS]

# Returns whether this is the last row of a line
def is_last_row_addr(addr, last):
    return addr[ROW_OFF_BITS:LINE_OFF_BITS] == last

# Returns whether this is the last row of a line
def is_last_row(row, last):
    return get_row_of_line(row) == last

# Return the next row in the current cache line. We use a
# dedicated function in order to limit the size of the
# generated adder to be only the bits within a cache line
# (3 bits with default settings)
def next_row(row):
    row_v = row[0:ROW_LINE_BITS] + 1
    return Cat(row_v[:ROW_LINE_BITS], row[ROW_LINE_BITS:])
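
# worked example of the wrap-around (default settings, ROW_LINE_BITS=3):
# next_row(0b0001111) == 0b0001000: the low 3 bits increment 0b111 -> 0b000
# and the upper (line-select) bits are passed through unchanged.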

# Get the tag value from the address
def get_tag(addr):
    return addr[SET_SIZE_BITS:REAL_ADDR_BITS]
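
# worked example of the decode helpers, for a (hypothetical) address
# 0x1234 with the default geometry:
#   get_index(0x1234) = addr[6:10]  = (0x1234 >> 6) & 0xF  = 0x8
#   get_row(0x1234)   = addr[3:10]  = (0x1234 >> 3) & 0x7F = 0x46
#   get_tag(0x1234)   = addr[10:56] = (0x1234 >> 10)       = 0x4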

# Read a tag from a tag memory row
def read_tag(way, tagset):
    return tagset.word_select(way, TAG_WIDTH)[:TAG_BITS]

# Read a TLB tag from a TLB tag memory row
def read_tlb_tag(way, tags):
    return tags.word_select(way, TLB_EA_TAG_BITS)

# Write a TLB tag to a TLB tag memory row
def write_tlb_tag(way, tags, tag):
    return read_tlb_tag(way, tags).eq(tag)

# Read a PTE from a TLB PTE memory row
def read_tlb_pte(way, ptes):
    return ptes.word_select(way, TLB_PTE_BITS)

def write_tlb_pte(way, ptes, newpte):
    return read_tlb_pte(way, ptes).eq(newpte)
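
# the way-selection helpers above all use word_select, which picks
# field number "way" of the given width.  e.g. with TLB_PTE_BITS=64,
# read_tlb_pte(1, ptes) is ptes[64:128]; likewise read_tag(way, tagset)
# is bits [way*48 : way*48+46] of the 192-bit tag set (defaults).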


# Record for storing permission, attribute, etc. bits from a PTE
class PermAttr(RecordObject):
    def __init__(self, name=None):
        super().__init__(name=name)
        self.reference = Signal()
        self.changed   = Signal()
        self.nocache   = Signal()
        self.priv      = Signal()
        self.rd_perm   = Signal()
        self.wr_perm   = Signal()


def extract_perm_attr(pte):
    pa = PermAttr()
    return pa


# Type of operation on a "valid" input
@unique
class Op(Enum):
    OP_NONE       = 0
    OP_BAD        = 1 # NC cache hit, TLB miss, prot/RC failure
    OP_STCX_FAIL  = 2 # conditional store w/o reservation
    OP_LOAD_HIT   = 3 # Cache hit on load
    OP_LOAD_MISS  = 4 # Load missing cache
    OP_LOAD_NC    = 5 # Non-cachable load
    OP_STORE_HIT  = 6 # Store hitting cache
    OP_STORE_MISS = 7 # Store missing cache


# Cache state machine
@unique
class State(Enum):
    IDLE             = 0 # Normal load hit processing
    RELOAD_WAIT_ACK  = 1 # Cache reload wait ack
    STORE_WAIT_ACK   = 2 # Store wait ack
    NC_LOAD_WAIT_ACK = 3 # Non-cachable load wait ack


# Dcache operations:
#
# In order to make timing, we use the BRAMs with
# an output buffer, which means that the BRAM
# output is delayed by an extra cycle.
#
# Thus, the dcache has a 2-stage internal pipeline
# for cache hits with no stalls.
#
# All other operations are handled via stalling
# in the first stage.
#
# The second stage can thus complete a hit at the same
# time as the first stage emits a stall for a complex op.
#
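# A rough per-cycle sketch of a stall-free load hit (an illustration
# based on the description above, not a normative timing spec):
#
#   cycle 0: request latched into r0           (stage 0)
#   cycle 1: TLB/tag compare, BRAM read issued (stage 1)
#   cycle 2: BRAM output buffer -> data out    (writeback)
#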
# Stage 0 register, basically contains just the latched request

class RegStage0(RecordObject):
    def __init__(self, name=None):
        super().__init__(name=name)
        self.req     = LoadStore1ToDCacheType(name="lsmem")
        self.tlbie   = Signal() # indicates a tlbie request (from MMU)
        self.doall   = Signal() # with tlbie, indicates flush whole TLB
        self.tlbld   = Signal() # indicates a TLB load request (from MMU)
        self.mmu_req = Signal() # indicates source of request
        self.d_valid = Signal() # indicates req.data is valid now


class MemAccessRequest(RecordObject):
    def __init__(self, name=None):
        super().__init__(name=name)
        self.op        = Signal(Op)
        self.valid     = Signal()
        self.dcbz      = Signal()
        self.real_addr = Signal(REAL_ADDR_BITS)
        self.data      = Signal(64)
        self.byte_sel  = Signal(8)
        self.hit_way   = Signal(WAY_BITS)
        self.same_tag  = Signal()
        self.mmu_req   = Signal()


# First stage register, contains state for stage 1 of load hits
# and for the state machine used by all other operations
class RegStage1(RecordObject):
    def __init__(self, name=None):
        super().__init__(name=name)
        # Info about the request
        self.full           = Signal() # have uncompleted request
        self.mmu_req        = Signal() # request is from MMU
        self.req            = MemAccessRequest(name="reqmem")

        # Cache hit state
        self.hit_way        = Signal(WAY_BITS)
        self.hit_load_valid = Signal()
        self.hit_index      = Signal(INDEX_BITS)
        self.cache_hit      = Signal()

        # TLB hit state
        self.tlb_hit        = TLBHit("tlb_hit")
        self.tlb_hit_index  = Signal(TLB_SET_BITS)

        # 2-stage data buffer for data forwarded from writes to reads
        self.forward_data1  = Signal(64)
        self.forward_data2  = Signal(64)
        self.forward_sel1   = Signal(8)
        self.forward_valid1 = Signal()
        self.forward_way1   = Signal(WAY_BITS)
        self.forward_row1   = Signal(ROW_BITS)
        self.use_forward1   = Signal()
        self.forward_sel    = Signal(8)

        # Cache miss state (reload state machine)
        self.state          = Signal(State)
        self.dcbz           = Signal()
        self.write_bram     = Signal()
        self.write_tag      = Signal()
        self.slow_valid     = Signal()
        self.wb             = WBMasterOut("wb")
        self.reload_tag     = Signal(TAG_BITS)
        self.store_way      = Signal(WAY_BITS)
        self.store_row      = Signal(ROW_BITS)
        self.store_index    = Signal(INDEX_BITS)
        self.end_row_ix     = Signal(ROW_LINE_BITS)
        self.rows_valid     = RowPerLineValidArray()
        self.acks_pending   = Signal(3)
        self.inc_acks       = Signal()
        self.dec_acks       = Signal()

        # Signals to complete (possibly with error)
        self.ls_valid       = Signal()
        self.ls_error       = Signal()
        self.mmu_done       = Signal()
        self.mmu_error      = Signal()
        self.cache_paradox  = Signal()

        # Signal to complete a failed stcx.
        self.stcx_fail      = Signal()


# Reservation information
class Reservation(RecordObject):
    def __init__(self):
        super().__init__()
        self.valid = Signal()
        self.addr  = Signal(64-LINE_OFF_BITS)


class DTLBUpdate(Elaboratable):
    def __init__(self):
        self.dtlb          = TLBArray()
        self.tlbie         = Signal()
        self.tlbwe         = Signal()
        self.doall         = Signal()
        self.tlb_hit       = TLBHit("tlb_hit")
        self.tlb_req_index = Signal(TLB_SET_BITS)

        self.tlb_tag_way   = Signal(TLB_TAG_WAY_BITS)
        self.tlb_pte_way   = Signal(TLB_PTE_WAY_BITS)
        self.repl_way      = Signal(TLB_WAY_BITS)
        self.eatag         = Signal(TLB_EA_TAG_BITS)
        self.pte_data      = Signal(TLB_PTE_BITS)

        # read from dtlb array
        self.tlb_read       = Signal()
        self.tlb_read_index = Signal(TLB_SET_BITS)
        self.tlb_way        = TLBRecord("o_tlb_way")

    def elaborate(self, platform):
        m = Module()
        comb = m.d.comb
        sync = m.d.sync

        tagset    = Signal(TLB_TAG_WAY_BITS)
        pteset    = Signal(TLB_PTE_WAY_BITS)
        updated   = Signal()
        v_updated = Signal()
        tb_out = Signal(TLB_TAG_WAY_BITS) # tlb_way_tags_t
        db_out = Signal(TLB_NUM_WAYS)     # tlb_way_valids_t
        pb_out = Signal(TLB_PTE_WAY_BITS) # tlb_way_ptes_t
        dv = Signal(TLB_NUM_WAYS)         # tlb_way_valids_t

        dtlb, tlb_req_index = self.dtlb, self.tlb_req_index
        comb += dv.eq(dtlb[tlb_req_index].valid)
        comb += db_out.eq(dv)

        with m.If(self.tlbie & self.doall):
            # clear all valid bits at once
            for i in range(TLB_SET_SIZE):
                sync += dtlb[i].valid.eq(0)
        with m.Elif(self.tlbie):
            # invalidate just the hit_way
            with m.If(self.tlb_hit.valid):
                comb += db_out.bit_select(self.tlb_hit.way, 1).eq(0)
                comb += v_updated.eq(1)
        with m.Elif(self.tlbwe):
            # write to the requested tag and PTE
            comb += tagset.eq(self.tlb_tag_way)
            comb += write_tlb_tag(self.repl_way, tagset, self.eatag)
            comb += tb_out.eq(tagset)

            comb += pteset.eq(self.tlb_pte_way)
            comb += write_tlb_pte(self.repl_way, pteset, self.pte_data)
            comb += pb_out.eq(pteset)

            comb += db_out.bit_select(self.repl_way, 1).eq(1)

            comb += updated.eq(1)
            comb += v_updated.eq(1)

        with m.If(updated):
            sync += dtlb[tlb_req_index].tag.eq(tb_out)
            sync += dtlb[tlb_req_index].pte.eq(pb_out)
        with m.If(v_updated):
            sync += dtlb[tlb_req_index].valid.eq(db_out)

        # select one TLB way
        with m.If(self.tlb_read):
            sync += self.tlb_way.eq(dtlb[self.tlb_read_index])

        return m


class DCachePendingHit(Elaboratable):

    def __init__(self, tlb_way,
                 cache_i_validdx, cache_tag_set,
                 req_addr,
                 hit_set):

        self.go         = Signal()
        self.virt_mode  = Signal()
        self.is_hit     = Signal()
        self.tlb_hit    = TLBHit("tlb_hit")
        self.hit_way    = Signal(WAY_BITS)
        self.rel_match  = Signal()
        self.req_index  = Signal(INDEX_BITS)
        self.reload_tag = Signal(TAG_BITS)

        self.tlb_way = tlb_way
        self.cache_i_validdx = cache_i_validdx
        self.cache_tag_set = cache_tag_set
        self.req_addr = req_addr
        self.hit_set = hit_set

    def elaborate(self, platform):
        m = Module()
        comb = m.d.comb
        sync = m.d.sync

        go = self.go
        virt_mode = self.virt_mode
        is_hit = self.is_hit
        tlb_way = self.tlb_way
        cache_i_validdx = self.cache_i_validdx
        cache_tag_set = self.cache_tag_set
        req_addr = self.req_addr
        tlb_hit = self.tlb_hit
        hit_set = self.hit_set
        hit_way = self.hit_way
        rel_match = self.rel_match
        req_index = self.req_index
        reload_tag = self.reload_tag

        rel_matches = Array(Signal(name="rel_matches_%d" % i) \
                            for i in range(TLB_NUM_WAYS))
        hit_way_set = HitWaySet()

        # Test if pending request is a hit on any way
        # In order to make timing in virtual mode,
        # when we are using the TLB, we compare each
        # way with each of the real addresses from each way of
        # the TLB, and then decide later which match to use.
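        # (with the defaults this is TLB_NUM_WAYS * NUM_WAYS = 2 * 4 = 8
        # tag comparators in parallel; tlb_hit.way then selects which
        # per-TLB-way result is the real one.)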

        with m.If(virt_mode):
            for j in range(TLB_NUM_WAYS): # tlb_num_way_t
                s_tag = Signal(TAG_BITS, name="s_tag%d" % j)
                s_hit = Signal()
                s_pte = Signal(TLB_PTE_BITS)
                s_ra  = Signal(REAL_ADDR_BITS)
                comb += s_pte.eq(read_tlb_pte(j, tlb_way.pte))
                comb += s_ra.eq(Cat(req_addr[0:TLB_LG_PGSZ],
                                    s_pte[TLB_LG_PGSZ:REAL_ADDR_BITS]))
                comb += s_tag.eq(get_tag(s_ra))

                for i in range(NUM_WAYS): # way_t
                    is_tag_hit = Signal(name="is_tag_hit_%d_%d" % (j, i))
                    comb += is_tag_hit.eq(go & cache_i_validdx[i] &
                                  (read_tag(i, cache_tag_set) == s_tag)
                                  & (tlb_way.valid[j]))
                    with m.If(is_tag_hit):
                        comb += hit_way_set[j].eq(i)
                        comb += s_hit.eq(1)
                comb += hit_set[j].eq(s_hit)
                with m.If(s_tag == reload_tag):
                    comb += rel_matches[j].eq(1)
            with m.If(tlb_hit.valid):
                comb += is_hit.eq(hit_set[tlb_hit.way])
                comb += hit_way.eq(hit_way_set[tlb_hit.way])
                comb += rel_match.eq(rel_matches[tlb_hit.way])
        with m.Else():
            s_tag = Signal(TAG_BITS)
            comb += s_tag.eq(get_tag(req_addr))
            for i in range(NUM_WAYS): # way_t
                is_tag_hit = Signal(name="is_tag_hit_%d" % i)
                comb += is_tag_hit.eq(go & cache_i_validdx[i] &
                          (read_tag(i, cache_tag_set) == s_tag))
                with m.If(is_tag_hit):
                    comb += hit_way.eq(i)
                    comb += is_hit.eq(1)
            with m.If(s_tag == reload_tag):
                comb += rel_match.eq(1)

        return m


class DCache(Elaboratable):
    """Set associative dcache write-through

    TODO (in no specific order):
    * See list in icache.vhdl
    * Complete load misses on the cycle when WB data comes instead of
      at the end of line (this requires dealing with requests coming in
      while not idle...)
    """
    def __init__(self):
        self.d_in      = LoadStore1ToDCacheType("d_in")
        self.d_out     = DCacheToLoadStore1Type("d_out")

        self.m_in      = MMUToDCacheType("m_in")
        self.m_out     = DCacheToMMUType("m_out")

        self.stall_out = Signal()

        # standard naming (wired to non-standard for compatibility)
        self.bus = Interface(addr_width=32,
                             data_width=64,
                             granularity=8,
                             features={'stall'},
                             alignment=0,
                             name="dcache")

        self.log_out   = Signal(20)

    def stage_0(self, m, r0, r1, r0_full):
        """Latch the request in r0.req as long as we're not stalling
        """
        comb = m.d.comb
        sync = m.d.sync
        d_in, d_out, m_in = self.d_in, self.d_out, self.m_in

        r = RegStage0("stage0")

        # TODO, this goes in unit tests and formal proofs
        with m.If(d_in.valid & m_in.valid):
            sync += Display("request collision loadstore vs MMU")

        with m.If(m_in.valid):
            comb += r.req.valid.eq(1)
            comb += r.req.load.eq(~(m_in.tlbie | m_in.tlbld)) # no invalidate
            comb += r.req.dcbz.eq(0)
            comb += r.req.nc.eq(0)
            comb += r.req.reserve.eq(0)
            comb += r.req.virt_mode.eq(0)
            comb += r.req.priv_mode.eq(1)
            comb += r.req.addr.eq(m_in.addr)
            comb += r.req.data.eq(m_in.pte)
            comb += r.req.byte_sel.eq(~0) # Const -1 sets all to 0b111....
            comb += r.tlbie.eq(m_in.tlbie)
            comb += r.doall.eq(m_in.doall)
            comb += r.tlbld.eq(m_in.tlbld)
            comb += r.mmu_req.eq(1)
            m.d.sync += Display("    DCACHE req mmu addr %x pte %x ld %d",
                                m_in.addr, m_in.pte, r.req.load)

        with m.Else():
            comb += r.req.eq(d_in)
            comb += r.req.data.eq(0)
            comb += r.tlbie.eq(0)
            comb += r.doall.eq(0)
            comb += r.tlbld.eq(0)
            comb += r.mmu_req.eq(0)
        with m.If((~r1.full & ~d_in.hold) | ~r0_full):
            sync += r0.eq(r)
            sync += r0_full.eq(r.req.valid)
        # Sample data the cycle after a request comes in from loadstore1.
        # If another request has come in already then the data will get
        # put directly into req.data below.
        with m.If(r0.req.valid & ~r.req.valid & ~r0.d_valid &
                  ~r0.mmu_req):
            sync += r0.req.data.eq(d_in.data)
            sync += r0.d_valid.eq(1)
        with m.If(d_in.valid):
            m.d.sync += Display("    DCACHE req cache "
                                "virt %d addr %x data %x ld %d",
                                r.req.virt_mode, r.req.addr,
                                r.req.data, r.req.load)

    def tlb_read(self, m, r0_stall, tlb_way, dtlb):
        """TLB
        Operates in the second cycle on the request latched in r0.req.
        TLB updates write the entry at the end of the second cycle.
        """
        comb = m.d.comb
        sync = m.d.sync
        m_in, d_in = self.m_in, self.d_in

        addrbits = Signal(TLB_SET_BITS)

        amin = TLB_LG_PGSZ
        amax = TLB_LG_PGSZ + TLB_SET_BITS

        with m.If(m_in.valid):
            comb += addrbits.eq(m_in.addr[amin : amax])
        with m.Else():
            comb += addrbits.eq(d_in.addr[amin : amax])

        # If we have any op and the previous op isn't finished,
        # then keep the same output for next cycle.
        d = self.dtlb_update
        comb += d.tlb_read_index.eq(addrbits)
        comb += d.tlb_read.eq(~r0_stall)
        comb += tlb_way.eq(d.tlb_way)

    def maybe_tlb_plrus(self, m, r1, tlb_plru_victim, tlb_req_index):
        """Generate TLB PLRUs
        """
        comb = m.d.comb
        sync = m.d.sync

        if TLB_NUM_WAYS == 0:
            return

        # Binary-to-Unary one-hot, enabled by tlb_hit valid
        tlb_plrus = PLRUs(TLB_SET_SIZE, TLB_WAY_BITS)
        m.submodules.tlb_plrus = tlb_plrus
        comb += tlb_plrus.way.eq(r1.tlb_hit.way)
        comb += tlb_plrus.valid.eq(r1.tlb_hit.valid)
        comb += tlb_plrus.index.eq(r1.tlb_hit_index)
        comb += tlb_plrus.isel.eq(tlb_req_index) # select victim
        comb += tlb_plru_victim.eq(tlb_plrus.o_index) # selected victim

    def tlb_search(self, m, tlb_req_index, r0, r0_valid,
                   tlb_way,
                   pte, tlb_hit, valid_ra, perm_attr, ra):

        comb = m.d.comb

        hitway = Signal(TLB_WAY_BITS)
        hit    = Signal()
        eatag  = Signal(TLB_EA_TAG_BITS)

        TLB_LG_END = TLB_LG_PGSZ + TLB_SET_BITS
        comb += tlb_req_index.eq(r0.req.addr[TLB_LG_PGSZ : TLB_LG_END])
        comb += eatag.eq(r0.req.addr[TLB_LG_END : 64 ])

        for i in range(TLB_NUM_WAYS):
            is_tag_hit = Signal(name="is_tag_hit%d" % i)
            tlb_tag = Signal(TLB_EA_TAG_BITS, name="tlb_tag%d" % i)
            comb += tlb_tag.eq(read_tlb_tag(i, tlb_way.tag))
            comb += is_tag_hit.eq((tlb_way.valid[i]) & (tlb_tag == eatag))
            with m.If(is_tag_hit):
                comb += hitway.eq(i)
                comb += hit.eq(1)

        comb += tlb_hit.valid.eq(hit & r0_valid)
        comb += tlb_hit.way.eq(hitway)

        with m.If(tlb_hit.valid):
            comb += pte.eq(read_tlb_pte(hitway, tlb_way.pte))
        comb += valid_ra.eq(tlb_hit.valid | ~r0.req.virt_mode)

        with m.If(r0.req.virt_mode):
            comb += ra.eq(Cat(Const(0, ROW_OFF_BITS),
                              r0.req.addr[ROW_OFF_BITS:TLB_LG_PGSZ],
                              pte[TLB_LG_PGSZ:REAL_ADDR_BITS]))
            comb += perm_attr.reference.eq(pte[8])
            comb += perm_attr.changed.eq(pte[7])
            comb += perm_attr.nocache.eq(pte[5])
            comb += perm_attr.priv.eq(pte[3])
            comb += perm_attr.rd_perm.eq(pte[2])
            comb += perm_attr.wr_perm.eq(pte[1])
        with m.Else():
            comb += ra.eq(Cat(Const(0, ROW_OFF_BITS),
                              r0.req.addr[ROW_OFF_BITS:REAL_ADDR_BITS]))
            comb += perm_attr.reference.eq(1)
            comb += perm_attr.changed.eq(1)
            comb += perm_attr.nocache.eq(0)
            comb += perm_attr.priv.eq(1)
            comb += perm_attr.rd_perm.eq(1)
            comb += perm_attr.wr_perm.eq(1)

        with m.If(valid_ra):
            m.d.sync += Display("DCACHE virt mode %d hit %d ra %x pte %x",
                                r0.req.virt_mode, tlb_hit.valid, ra, pte)
            m.d.sync += Display("       perm ref=%d", perm_attr.reference)
            m.d.sync += Display("       perm chg=%d", perm_attr.changed)
            m.d.sync += Display("       perm noc=%d", perm_attr.nocache)
            m.d.sync += Display("       perm prv=%d", perm_attr.priv)
            m.d.sync += Display("       perm rdp=%d", perm_attr.rd_perm)
            m.d.sync += Display("       perm wrp=%d", perm_attr.wr_perm)

    def tlb_update(self, m, r0_valid, r0, dtlb, tlb_req_index,
                   tlb_hit, tlb_plru_victim, tlb_way):

        comb = m.d.comb
        sync = m.d.sync

        tlbie = Signal()
        tlbwe = Signal()

        comb += tlbie.eq(r0_valid & r0.tlbie)
        comb += tlbwe.eq(r0_valid & r0.tlbld)

        d = self.dtlb_update

        comb += d.tlbie.eq(tlbie)
        comb += d.tlbwe.eq(tlbwe)
        comb += d.doall.eq(r0.doall)
        comb += d.tlb_hit.eq(tlb_hit)
        comb += d.tlb_tag_way.eq(tlb_way.tag)
        comb += d.tlb_pte_way.eq(tlb_way.pte)
        comb += d.tlb_req_index.eq(tlb_req_index)

        with m.If(tlb_hit.valid):
            comb += d.repl_way.eq(tlb_hit.way)
        with m.Else():
            comb += d.repl_way.eq(tlb_plru_victim)
        comb += d.eatag.eq(r0.req.addr[TLB_LG_PGSZ + TLB_SET_BITS:64])
        comb += d.pte_data.eq(r0.req.data)

    def maybe_plrus(self, m, r1, plru_victim):
        """Generate PLRUs
        """
        comb = m.d.comb
        sync = m.d.sync

        if NUM_WAYS == 0:
            return

        m.submodules.plrus = plrus = PLRUs(NUM_LINES, WAY_BITS)
        comb += plrus.way.eq(r1.hit_way)
        comb += plrus.valid.eq(r1.cache_hit)
        comb += plrus.index.eq(r1.hit_index)
        comb += plrus.isel.eq(r1.store_index) # select victim
        comb += plru_victim.eq(plrus.o_index) # selected victim

    def cache_tag_read(self, m, r0_stall, req_index, cache_tag_set, cache_tags):
        """Cache tag RAM read port
        """
        comb = m.d.comb
        sync = m.d.sync
        m_in, d_in = self.m_in, self.d_in

        index = Signal(INDEX_BITS)

        with m.If(r0_stall):
            comb += index.eq(req_index)
        with m.Elif(m_in.valid):
            comb += index.eq(get_index(m_in.addr))
        with m.Else():
            comb += index.eq(get_index(d_in.addr))
        sync += cache_tag_set.eq(cache_tags[index].tag)

    def dcache_request(self, m, r0, ra, req_index, req_row, req_tag,
                       r0_valid, r1, cache_tags, replace_way,
                       use_forward1_next, use_forward2_next,
                       req_hit_way, plru_victim, rc_ok, perm_attr,
                       valid_ra, perm_ok, access_ok, req_op, req_go,
                       tlb_hit, tlb_way, cache_tag_set,
                       cancel_store, req_same_tag, r0_stall, early_req_row):
        """Cache request parsing and hit detection
        """

        comb = m.d.comb
        m_in, d_in = self.m_in, self.d_in

        is_hit  = Signal()
        hit_way = Signal(WAY_BITS)
        op      = Signal(Op)
        opsel   = Signal(3)
        go      = Signal()
        nc      = Signal()
        hit_set = Array(Signal(name="hit_set_%d" % i) \
                        for i in range(TLB_NUM_WAYS))
        cache_i_validdx = Signal(NUM_WAYS)

        # Extract line, row and tag from request
        comb += req_index.eq(get_index(r0.req.addr))
        comb += req_row.eq(get_row(r0.req.addr))
        comb += req_tag.eq(get_tag(ra))

        if False: # display on comb is a bit... busy.
            comb += Display("dcache_req addr:%x ra: %x idx: %x tag: %x row: %x",
                            r0.req.addr, ra, req_index, req_tag, req_row)

        comb += go.eq(r0_valid & ~(r0.tlbie | r0.tlbld) & ~r1.ls_error)
        comb += cache_i_validdx.eq(cache_tags[req_index].valid)

        m.submodules.dcache_pend = dc = DCachePendingHit(tlb_way,
                                cache_i_validdx, cache_tag_set,
                                r0.req.addr,
                                hit_set)
        comb += dc.tlb_hit.eq(tlb_hit)
        comb += dc.reload_tag.eq(r1.reload_tag)
        comb += dc.virt_mode.eq(r0.req.virt_mode)
        comb += dc.go.eq(go)
        comb += dc.req_index.eq(req_index)

        comb += is_hit.eq(dc.is_hit)
        comb += hit_way.eq(dc.hit_way)
        comb += req_same_tag.eq(dc.rel_match)

        # See if the request matches the line currently being reloaded
        with m.If((r1.state == State.RELOAD_WAIT_ACK) &
                  (req_index == r1.store_index) & req_same_tag):
            # For a store, consider this a hit even if the row isn't
            # valid since it will be by the time we perform the store.
            # For a load, check the appropriate row valid bit.
            rrow = Signal(ROW_LINE_BITS)
            comb += rrow.eq(req_row)
            valid = r1.rows_valid[rrow]
            comb += is_hit.eq((~r0.req.load) | valid)
            comb += hit_way.eq(replace_way)

        # Whether to use forwarded data for a load or not
        with m.If((get_row(r1.req.real_addr) == req_row) &
                  (r1.req.hit_way == hit_way)):
            # Only need to consider r1.write_bram here, since if we
            # are writing refill data here, then we don't have a
            # cache hit this cycle on the line being refilled.
            # (There is the possibility that the load following the
            # load miss that started the refill could be to the old
            # contents of the victim line, since it is a couple of
            # cycles after the refill starts before we see the updated
            # cache tag. In that case we don't use the bypass.)
            comb += use_forward1_next.eq(r1.write_bram)
        with m.If((r1.forward_row1 == req_row) & (r1.forward_way1 == hit_way)):
            comb += use_forward2_next.eq(r1.forward_valid1)

        # The way that matched on a hit
        comb += req_hit_way.eq(hit_way)

        # The way to replace on a miss
        with m.If(r1.write_tag):
            comb += replace_way.eq(plru_victim)
        with m.Else():
            comb += replace_way.eq(r1.store_way)

        # work out whether we have permission for this access
        # NB we don't yet implement AMR, thus no KUAP
        comb += rc_ok.eq(perm_attr.reference
                         & (r0.req.load | perm_attr.changed))
        comb += perm_ok.eq((r0.req.priv_mode | (~perm_attr.priv)) &
                           (perm_attr.wr_perm |
                            (r0.req.load & perm_attr.rd_perm)))
        comb += access_ok.eq(valid_ra & perm_ok & rc_ok)

        # Combine the request and cache hit status to decide what
        # operation needs to be done
        comb += nc.eq(r0.req.nc | perm_attr.nocache)
        comb += op.eq(Op.OP_NONE)
        with m.If(go):
            with m.If(~access_ok):
                m.d.sync += Display("DCACHE access fail valid_ra=%d p=%d rc=%d",
                                    valid_ra, perm_ok, rc_ok)
                comb += op.eq(Op.OP_BAD)
            with m.Elif(cancel_store):
                m.d.sync += Display("DCACHE cancel store")
                comb += op.eq(Op.OP_STCX_FAIL)
            with m.Else():
                m.d.sync += Display("DCACHE valid_ra=%d nc=%d ld=%d",
                                    valid_ra, nc, r0.req.load)
                comb += opsel.eq(Cat(is_hit, nc, r0.req.load))
                with m.Switch(opsel):
                    with m.Case(0b101): comb += op.eq(Op.OP_LOAD_HIT)
                    with m.Case(0b100): comb += op.eq(Op.OP_LOAD_MISS)
                    with m.Case(0b110): comb += op.eq(Op.OP_LOAD_NC)
                    with m.Case(0b001): comb += op.eq(Op.OP_STORE_HIT)
                    with m.Case(0b000): comb += op.eq(Op.OP_STORE_MISS)
                    with m.Case(0b010): comb += op.eq(Op.OP_STORE_MISS)
                    with m.Case(0b011): comb += op.eq(Op.OP_BAD)
                    with m.Case(0b111): comb += op.eq(Op.OP_BAD)
        comb += req_op.eq(op)
        comb += req_go.eq(go)

        # Version of the row number that is valid one cycle earlier
        # in the cases where we need to read the cache data BRAM.
        # If we're stalling then we need to keep reading the last
        # row requested.
        with m.If(~r0_stall):
            with m.If(m_in.valid):
                comb += early_req_row.eq(get_row(m_in.addr))
            with m.Else():
                comb += early_req_row.eq(get_row(d_in.addr))
        with m.Else():
            comb += early_req_row.eq(req_row)

    def reservation_comb(self, m, cancel_store, set_rsrv, clear_rsrv,
                         r0_valid, r0, reservation):
        """Handle load-with-reservation and store-conditional instructions
        """
        comb = m.d.comb

        with m.If(r0_valid & r0.req.reserve):
            # XXX generate alignment interrupt if address
            # is not aligned XXX or if r0.req.nc = '1'
            with m.If(r0.req.load):
                comb += set_rsrv.eq(r0.req.atomic_last) # load with reservation
            with m.Else():
                comb += clear_rsrv.eq(r0.req.atomic_last) # store conditional
                with m.If((~reservation.valid) |
                          (r0.req.addr[LINE_OFF_BITS:64] != reservation.addr)):
                    comb += cancel_store.eq(1)

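    # note on granularity: Reservation.addr drops the low LINE_OFF_BITS,
    # so the reservation is effectively tracked per cache line
    # (64 bytes with the default settings).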
    def reservation_reg(self, m, r0_valid, access_ok, set_rsrv, clear_rsrv,
                        reservation, r0):
        comb = m.d.comb
        sync = m.d.sync

        with m.If(r0_valid & access_ok):
            with m.If(clear_rsrv):
                sync += reservation.valid.eq(0)
            with m.Elif(set_rsrv):
                sync += reservation.valid.eq(1)
                sync += reservation.addr.eq(r0.req.addr[LINE_OFF_BITS:64])

    def writeback_control(self, m, r1, cache_out_row):
        """Return data for loads & completion control logic
        """
        comb = m.d.comb
        sync = m.d.sync
        d_out, m_out = self.d_out, self.m_out

        data_out = Signal(64)
        data_fwd = Signal(64)

        # Use the bypass if we are reading the row that was
        # written 1 or 2 cycles ago, including for the
        # slow_valid = 1 case (i.e. completing a load
        # miss or a non-cacheable load).
        with m.If(r1.use_forward1):
            comb += data_fwd.eq(r1.forward_data1)
        with m.Else():
            comb += data_fwd.eq(r1.forward_data2)

        comb += data_out.eq(cache_out_row)

        for i in range(8):
            with m.If(r1.forward_sel[i]):
                dsel = data_fwd.word_select(i, 8)
                comb += data_out.word_select(i, 8).eq(dsel)

        # DCache output to LoadStore
        comb += d_out.valid.eq(r1.ls_valid)
        comb += d_out.data.eq(data_out)
        comb += d_out.store_done.eq(~r1.stcx_fail)
        comb += d_out.error.eq(r1.ls_error)
        comb += d_out.cache_paradox.eq(r1.cache_paradox)

        # Outputs to MMU
        comb += m_out.done.eq(r1.mmu_done)
        comb += m_out.err.eq(r1.mmu_error)
        comb += m_out.data.eq(data_out)

        # We have a valid load or store hit or we just completed
        # a slow op such as a load miss, a NC load or a store
        #
        # Note: the load hit is delayed by one cycle. However it
        # still cannot collide with r1.slow_valid (well, unless I
        # miscalculated) because slow_valid can only be set on a
        # subsequent request and not on its first cycle (the state
        # machine must have advanced), which makes slow_valid
        # at least 2 cycles from the previous hit_load_valid.

        # Sanity: Only one of these must be set in any given cycle

        if False: # TODO: need Display to get this to work
            assert (r1.slow_valid & r1.stcx_fail) != 1, \
                "unexpected slow_valid collision with stcx_fail"

            assert ((r1.slow_valid | r1.stcx_fail) | r1.hit_load_valid) != 1, \
                "unexpected hit_load_delayed collision with slow_valid"

        with m.If(~r1.mmu_req):
            # Request came from loadstore1...
            # Load hit case is the standard path
            with m.If(r1.hit_load_valid):
                sync += Display("completing load hit data=%x", data_out)

            # error cases complete without stalling
            with m.If(r1.ls_error):
                with m.If(r1.dcbz):
                    sync += Display("completing dcbz with error")
                with m.Else():
                    sync += Display("completing ld/st with error")

            # Slow ops (load miss, NC, stores)
            with m.If(r1.slow_valid):
                sync += Display("completing store or load miss adr=%x data=%x",
                                r1.req.real_addr, data_out)

        with m.Else():
            # Request came from MMU
            with m.If(r1.hit_load_valid):
                sync += Display("completing load hit to MMU, data=%x",
                                m_out.data)
            # error cases complete without stalling
            with m.If(r1.mmu_error):
                sync += Display("completing MMU ld with error")

            # Slow ops (i.e. load miss)
            with m.If(r1.slow_valid):
                sync += Display("completing MMU load miss, adr=%x data=%x",
                                r1.req.real_addr, m_out.data)

    def rams(self, m, r1, early_req_row, cache_out_row, replace_way):
        """rams
        Generate a cache RAM for each way. This handles the normal
        reads, writes from reloads and the special store-hit update
        path as well.

        Note: the BRAMs have an extra read buffer, meaning the output
        is pipelined an extra cycle. This differs from the
        icache. The writeback logic needs to take that into
        account by using 1-cycle delayed signals for load hits.
        """
        comb = m.d.comb
        bus = self.bus

        # Binary-to-Unary one-hot decoders here. The replace-way one-hot
        # is gated (enabled) by bus.ack, not-write-bram, and state
        # RELOAD_WAIT_ACK
        m.submodules.rams_replace_way_e = rwe = Decoder(NUM_WAYS)
        comb += rwe.n.eq(~((r1.state == State.RELOAD_WAIT_ACK) & bus.ack &
                           ~r1.write_bram))
        comb += rwe.i.eq(replace_way)

        m.submodules.rams_hit_way_e = hwe = Decoder(NUM_WAYS)
        comb += hwe.i.eq(r1.hit_way)

        # this one is gated with write_bram, and replace_way_e can never be
        # set at the same time. that means that do_write can OR the outputs
        m.submodules.rams_hit_req_way_e = hre = Decoder(NUM_WAYS)
        comb += hre.n.eq(~r1.write_bram) # Decoder.n is inverted
        comb += hre.i.eq(r1.req.hit_way)

        # common Signals
        do_read = Signal()
        wr_addr = Signal(ROW_BITS)
        wr_data = Signal(WB_DATA_BITS)
        wr_sel  = Signal(ROW_SIZE)
        rd_addr = Signal(ROW_BITS)

        comb += do_read.eq(1) # always enable
        comb += rd_addr.eq(early_req_row)

        # Write mux:
        #
        # Defaults to wishbone read responses (cache refill)
        #
        # For timing, the mux on wr_data/sel/addr is not
        # dependent on anything other than the current state.

        with m.If(r1.write_bram):
            # Write store data to BRAM.  This happens one
            # cycle after the store is in r0.
            comb += wr_data.eq(r1.req.data)
            comb += wr_sel.eq(r1.req.byte_sel)
            comb += wr_addr.eq(get_row(r1.req.real_addr))

        with m.Else():
            # Otherwise, we might be doing a reload or a DCBZ
            with m.If(r1.dcbz):
                comb += wr_data.eq(0)
            with m.Else():
                comb += wr_data.eq(bus.dat_r)
            comb += wr_addr.eq(r1.store_row)
            comb += wr_sel.eq(~0) # all 1s

        # set up Cache Rams
        for i in range(NUM_WAYS):
            do_write = Signal(name="do_wr%d" % i)
            wr_sel_m = Signal(ROW_SIZE, name="wr_sel_m_%d" % i)
            d_out    = Signal(WB_DATA_BITS, name="dout_%d" % i) # cache_row_t

            way = CacheRam(ROW_BITS, WB_DATA_BITS, ADD_BUF=True, ram_num=i)
            setattr(m.submodules, "cacheram_%d" % i, way)

            comb += way.rd_en.eq(do_read)
            comb += way.rd_addr.eq(rd_addr)
            comb += d_out.eq(way.rd_data_o)
            comb += way.wr_sel.eq(wr_sel_m)
            comb += way.wr_addr.eq(wr_addr)
            comb += way.wr_data.eq(wr_data)

            # Cache hit reads
            with m.If(hwe.o[i]):
                comb += cache_out_row.eq(d_out)

            # these are mutually-exclusive via their Decoder-enablers
            # (note: Decoder-enable is inverted)
            comb += do_write.eq(hre.o[i] | rwe.o[i])

            # Mask write selects with do_write since BRAM
            # doesn't have a global write-enable
            with m.If(do_write):
                comb += wr_sel_m.eq(wr_sel)

    # Cache hit synchronous machine for the easy case.
    # This handles load hits.
    # It also handles error cases (TLB miss, cache paradox)
    def dcache_fast_hit(self, m, req_op, r0_valid, r0, r1,
                        req_hit_way, req_index, req_tag, access_ok,
                        tlb_hit, tlb_req_index):
        comb = m.d.comb
        sync = m.d.sync

        with m.If(req_op != Op.OP_NONE):
            sync += Display("op:%d addr:%x nc: %d idx: %x tag: %x way: %x",
                            req_op, r0.req.addr, r0.req.nc,
                            req_index, req_tag, req_hit_way)

        with m.If(r0_valid):
            sync += r1.mmu_req.eq(r0.mmu_req)

        # Fast path for load/store hits.
        # Set signals for the writeback controls.
        sync += r1.hit_way.eq(req_hit_way)
        sync += r1.hit_index.eq(req_index)

        sync += r1.hit_load_valid.eq(req_op == Op.OP_LOAD_HIT)
        sync += r1.cache_hit.eq((req_op == Op.OP_LOAD_HIT) |
                                (req_op == Op.OP_STORE_HIT))

        with m.If(req_op == Op.OP_BAD):
            sync += Display("Signalling ld/st error "
                            "ls_error=%i mmu_error=%i cache_paradox=%i",
                            ~r0.mmu_req, r0.mmu_req, access_ok)
            sync += r1.ls_error.eq(~r0.mmu_req)
            sync += r1.mmu_error.eq(r0.mmu_req)
            sync += r1.cache_paradox.eq(access_ok)
        with m.Else():
            sync += r1.ls_error.eq(0)
            sync += r1.mmu_error.eq(0)
            sync += r1.cache_paradox.eq(0)

        sync += r1.stcx_fail.eq(req_op == Op.OP_STCX_FAIL)

        # Record TLB hit information for updating TLB PLRU
        sync += r1.tlb_hit.eq(tlb_hit)
        sync += r1.tlb_hit_index.eq(tlb_req_index)

    # Memory accesses are handled by this state machine:
    #
    #   * Cache load miss/reload (in conjunction with "rams")
    #   * Load hits for non-cachable forms
    #   * Stores (the collision case is handled in "rams")
    #
    # All wishbone request generation is done here.
    # This machine operates at stage 1.
    def dcache_slow(self, m, r1, use_forward1_next, use_forward2_next,
                    r0, replace_way,
                    req_hit_way, req_same_tag,
                    r0_valid, req_op, cache_tags, req_go, ra):

        comb = m.d.comb
        sync = m.d.sync
        bus = self.bus
        d_in = self.d_in

        req = MemAccessRequest("mreq_ds")

        req_row = Signal(ROW_BITS)
        req_idx = Signal(INDEX_BITS)
        req_tag = Signal(TAG_BITS)
        comb += req_idx.eq(get_index(req.real_addr))
        comb += req_row.eq(get_row(req.real_addr))
        comb += req_tag.eq(get_tag(req.real_addr))

        sync += r1.use_forward1.eq(use_forward1_next)
        sync += r1.forward_sel.eq(0)

        with m.If(use_forward1_next):
            sync += r1.forward_sel.eq(r1.req.byte_sel)
        with m.Elif(use_forward2_next):
            sync += r1.forward_sel.eq(r1.forward_sel1)

        sync += r1.forward_data2.eq(r1.forward_data1)
        with m.If(r1.write_bram):
            sync += r1.forward_data1.eq(r1.req.data)
            sync += r1.forward_sel1.eq(r1.req.byte_sel)
            sync += r1.forward_way1.eq(r1.req.hit_way)
            sync += r1.forward_row1.eq(get_row(r1.req.real_addr))
            sync += r1.forward_valid1.eq(1)
        with m.Else():
            with m.If(r1.dcbz):
                sync += r1.forward_data1.eq(0)
            with m.Else():
                sync += r1.forward_data1.eq(bus.dat_r)
            sync += r1.forward_sel1.eq(~0) # all 1s
            sync += r1.forward_way1.eq(replace_way)
            sync += r1.forward_row1.eq(r1.store_row)
            sync += r1.forward_valid1.eq(0)

        # One-cycle pulses: reset them here
        sync += r1.slow_valid.eq(0)
        sync += r1.write_bram.eq(0)
        sync += r1.inc_acks.eq(0)
        sync += r1.dec_acks.eq(0)

        sync += r1.ls_valid.eq(0)
        # complete tlbies and TLB loads in the third cycle
        sync += r1.mmu_done.eq(r0_valid & (r0.tlbie | r0.tlbld))

        with m.If((req_op == Op.OP_LOAD_HIT) | (req_op == Op.OP_STCX_FAIL)):
            with m.If(~r0.mmu_req):
                sync += r1.ls_valid.eq(1)
            with m.Else():
                sync += r1.mmu_done.eq(1)

        with m.If(r1.write_tag):
            # Store new tag in selected way
            replace_way_onehot = Signal(NUM_WAYS)
            comb += replace_way_onehot.eq(1<<replace_way)
            for i in range(NUM_WAYS):
                with m.If(replace_way_onehot[i]):
                    ct = Signal(TAG_RAM_WIDTH)
                    comb += ct.eq(cache_tags[r1.store_index].tag)
                    comb += ct.word_select(i, TAG_WIDTH).eq(r1.reload_tag)
                    sync += cache_tags[r1.store_index].tag.eq(ct)
            sync += r1.store_way.eq(replace_way)
            sync += r1.write_tag.eq(0)

        # Take request from r1.req if there is one there,
        # else from req_op, ra, etc.
        with m.If(r1.full):
            comb += req.eq(r1.req)
        with m.Else():
            comb += req.op.eq(req_op)
            comb += req.valid.eq(req_go)
            comb += req.mmu_req.eq(r0.mmu_req)
            comb += req.dcbz.eq(r0.req.dcbz)
            comb += req.real_addr.eq(ra)

            with m.If(r0.req.dcbz):
                # force data to 0 for dcbz
                comb += req.data.eq(0)
            with m.Elif(r0.d_valid):
                comb += req.data.eq(r0.req.data)
            with m.Else():
                comb += req.data.eq(d_in.data)

            # Select all bytes for dcbz
            # and for cacheable loads
            with m.If(r0.req.dcbz | (r0.req.load & ~r0.req.nc)):
                comb += req.byte_sel.eq(~0) # all 1s
            with m.Else():
                comb += req.byte_sel.eq(r0.req.byte_sel)
            comb += req.hit_way.eq(req_hit_way)
            comb += req.same_tag.eq(req_same_tag)

        # Store the incoming request from r0,
        # if it is a slow request
        # Note that r1.full = 1 implies req_op = OP_NONE
        with m.If((req_op == Op.OP_LOAD_MISS)
                  | (req_op == Op.OP_LOAD_NC)
                  | (req_op == Op.OP_STORE_MISS)
                  | (req_op == Op.OP_STORE_HIT)):
            sync += r1.req.eq(req)
            sync += r1.full.eq(1)

        # Main state machine
        with m.Switch(r1.state):

            with m.Case(State.IDLE):
                sync += r1.wb.adr.eq(req.real_addr[ROW_LINE_BITS:])
                sync += r1.wb.sel.eq(req.byte_sel)
                sync += r1.wb.dat.eq(req.data)
                sync += r1.dcbz.eq(req.dcbz)

                # Keep track of our index and way
                # for subsequent stores.
                sync += r1.store_index.eq(req_idx)
                sync += r1.store_row.eq(req_row)
                sync += r1.end_row_ix.eq(get_row_of_line(req_row)-1)
                sync += r1.reload_tag.eq(req_tag)
                sync += r1.req.same_tag.eq(1)

                with m.If(req.op == Op.OP_STORE_HIT):
                    sync += r1.store_way.eq(req.hit_way)

                # Reset per-row valid bits,
                # ready for handling OP_LOAD_MISS
                for i in range(ROW_PER_LINE):
                    sync += r1.rows_valid[i].eq(0)

                with m.If(req_op != Op.OP_NONE):
                    sync += Display("cache op %d", req.op)

                with m.Switch(req.op):
                    with m.Case(Op.OP_LOAD_HIT):
                        # stay in IDLE state
                        pass

                    with m.Case(Op.OP_LOAD_MISS):
                        sync += Display("cache miss real addr: %x " \
                                        "idx: %x tag: %x",
                                        req.real_addr, req_row, req_tag)

                        # Start the wishbone cycle
                        sync += r1.wb.we.eq(0)
                        sync += r1.wb.cyc.eq(1)
                        sync += r1.wb.stb.eq(1)

                        # Track that we had one request sent
                        sync += r1.state.eq(State.RELOAD_WAIT_ACK)
                        sync += r1.write_tag.eq(1)

                    with m.Case(Op.OP_LOAD_NC):
                        sync += r1.wb.cyc.eq(1)
                        sync += r1.wb.stb.eq(1)
                        sync += r1.wb.we.eq(0)
                        sync += r1.state.eq(State.NC_LOAD_WAIT_ACK)

                    with m.Case(Op.OP_STORE_HIT, Op.OP_STORE_MISS):
                        with m.If(~req.dcbz):
                            sync += r1.state.eq(State.STORE_WAIT_ACK)
                            sync += r1.acks_pending.eq(1)
                            sync += r1.full.eq(0)
                            sync += r1.slow_valid.eq(1)

                            with m.If(~req.mmu_req):
                                sync += r1.ls_valid.eq(1)
                            with m.Else():
                                sync += r1.mmu_done.eq(1)

                            with m.If(req.op == Op.OP_STORE_HIT):
                                sync += r1.write_bram.eq(1)
                        with m.Else():
                            # dcbz is handled much like a load miss except
                            # that we are writing to memory instead of reading
                            sync += r1.state.eq(State.RELOAD_WAIT_ACK)

                            with m.If(req.op == Op.OP_STORE_MISS):
                                sync += r1.write_tag.eq(1)

                        sync += r1.wb.we.eq(1)
                        sync += r1.wb.cyc.eq(1)
                        sync += r1.wb.stb.eq(1)

                    # OP_NONE and OP_BAD do nothing
                    # OP_BAD & OP_STCX_FAIL were
                    # handled above already
                    with m.Case(Op.OP_NONE):
                        pass
                    with m.Case(Op.OP_BAD):
                        pass
                    with m.Case(Op.OP_STCX_FAIL):
                        pass

            with m.Case(State.RELOAD_WAIT_ACK):
                ld_stbs_done = Signal()
                # Requests are all sent if stb is 0
                comb += ld_stbs_done.eq(~r1.wb.stb)

                # If we are still sending requests, was one accepted?
                with m.If((~bus.stall) & r1.wb.stb):
                    # That was the last word? We are done sending.
                    # Clear stb and set ld_stbs_done so we can handle an
                    # eventual last ack on the same cycle.
                    # sigh - reconstruct wb adr with 3 extra 0s at front
                    wb_adr = Cat(Const(0, ROW_OFF_BITS), r1.wb.adr)
                    with m.If(is_last_row_addr(wb_adr, r1.end_row_ix)):
                        sync += r1.wb.stb.eq(0)
                        comb += ld_stbs_done.eq(1)

                    # Calculate the next row address in the current cache line
                    row = Signal(LINE_OFF_BITS-ROW_OFF_BITS)
                    comb += row.eq(r1.wb.adr)
                    sync += r1.wb.adr[:LINE_OFF_BITS-ROW_OFF_BITS].eq(row+1)

                # Incoming acks processing
                sync += r1.forward_valid1.eq(bus.ack)
                with m.If(bus.ack):
                    srow = Signal(ROW_LINE_BITS)
                    comb += srow.eq(r1.store_row)
                    sync += r1.rows_valid[srow].eq(1)

                    # If this is the data we were looking for,
                    # we can complete the request next cycle.
                    # Compare the whole address in case the
                    # request in r1.req is not the one that
                    # started this refill.
                    with m.If(req.valid & r1.req.same_tag &
                              ((r1.dcbz & r1.req.dcbz) |
                               (~r1.dcbz & (r1.req.op == Op.OP_LOAD_MISS))) &
                              (r1.store_row == get_row(req.real_addr))):
                        sync += r1.full.eq(0)
                        sync += r1.slow_valid.eq(1)
                        with m.If(~r1.mmu_req):
                            sync += r1.ls_valid.eq(1)
                        with m.Else():
                            sync += r1.mmu_done.eq(1)
                        sync += r1.forward_sel.eq(~0) # all 1s
                        sync += r1.use_forward1.eq(1)

                    # Check for completion
                    with m.If(ld_stbs_done & is_last_row(r1.store_row,
                                                         r1.end_row_ix)):
                        # Complete wishbone cycle
                        sync += r1.wb.cyc.eq(0)

                        # Cache line is now valid
                        cv = Signal(INDEX_BITS)
                        comb += cv.eq(cache_tags[r1.store_index].valid)
                        comb += cv.bit_select(r1.store_way, 1).eq(1)
                        sync += cache_tags[r1.store_index].valid.eq(cv)

                        sync += r1.state.eq(State.IDLE)
                        sync += Display("cache valid set %x "
                                        "idx %d way %d",
                                        cv, r1.store_index, r1.store_way)

                    # Increment store row counter
                    sync += r1.store_row.eq(next_row(r1.store_row))

            with m.Case(State.STORE_WAIT_ACK):
                st_stbs_done = Signal()
                acks        = Signal(3)
                adjust_acks = Signal(3)

                comb += st_stbs_done.eq(~r1.wb.stb)
                comb += acks.eq(r1.acks_pending)

                with m.If(r1.inc_acks != r1.dec_acks):
                    with m.If(r1.inc_acks):
                        comb += adjust_acks.eq(acks + 1)
                    with m.Else():
                        comb += adjust_acks.eq(acks - 1)
                with m.Else():
                    comb += adjust_acks.eq(acks)

                sync += r1.acks_pending.eq(adjust_acks)

                # Clear stb when slave accepted request
                with m.If(~bus.stall):
                    # See if there is another store waiting
                    # to be done which is in the same real page.
                    with m.If(req.valid):
                        _ra = req.real_addr[ROW_LINE_BITS:SET_SIZE_BITS]
                        sync += r1.wb.adr[0:SET_SIZE_BITS].eq(_ra)
                        sync += r1.wb.dat.eq(req.data)
                        sync += r1.wb.sel.eq(req.byte_sel)

                    with m.If((adjust_acks < 7) & req.same_tag &
                              ((req.op == Op.OP_STORE_MISS)
                               | (req.op == Op.OP_STORE_HIT))):
                        sync += r1.wb.stb.eq(1)
                        comb += st_stbs_done.eq(0)

                        with m.If(req.op == Op.OP_STORE_HIT):
                            sync += r1.write_bram.eq(1)
                        sync += r1.full.eq(0)
                        sync += r1.slow_valid.eq(1)

                        # Store requests never come from the MMU
                        sync += r1.ls_valid.eq(1)
                        comb += st_stbs_done.eq(0)
                        sync += r1.inc_acks.eq(1)
                    with m.Else():
                        sync += r1.wb.stb.eq(0)
                        comb += st_stbs_done.eq(1)

                # Got ack ? See if complete.
                with m.If(bus.ack):
                    with m.If(st_stbs_done & (adjust_acks == 1)):
                        sync += r1.state.eq(State.IDLE)
                        sync += r1.wb.cyc.eq(0)
                        sync += r1.wb.stb.eq(0)
                    sync += r1.dec_acks.eq(1)

            with m.Case(State.NC_LOAD_WAIT_ACK):
                # Clear stb when slave accepted request
                with m.If(~bus.stall):
                    sync += r1.wb.stb.eq(0)

                # Got ack ? complete.
                with m.If(bus.ack):
                    sync += r1.state.eq(State.IDLE)
                    sync += r1.full.eq(0)
                    sync += r1.slow_valid.eq(1)

                    with m.If(~r1.mmu_req):
                        sync += r1.ls_valid.eq(1)
                    with m.Else():
                        sync += r1.mmu_done.eq(1)

                    sync += r1.forward_sel.eq(~0) # all 1s
                    sync += r1.use_forward1.eq(1)
                    sync += r1.wb.cyc.eq(0)
                    sync += r1.wb.stb.eq(0)

    # (currently unused: the call in elaborate() is commented out)
    def dcache_log(self, m, r1, valid_ra, tlb_hit, stall_out, req_op):

        sync = m.d.sync
        d_out, bus, log_out = self.d_out, self.bus, self.log_out

        sync += log_out.eq(Cat(r1.state[:3], valid_ra, tlb_hit.way[:3],
                               stall_out, req_op[:3], d_out.valid, d_out.error,
                               r1.wb.cyc, r1.wb.stb, bus.ack, bus.stall,
                               r1.req.real_addr[3:6]))

    def elaborate(self, platform):

        m = Module()
        comb = m.d.comb
        d_in = self.d_in

        # Storage. Hopefully "cache_rows" is a BRAM, the rest is LUTs
        cache_tags    = CacheTagArray()
        cache_tag_set = Signal(TAG_RAM_WIDTH)

        # TODO attribute ram_style : string;
        # TODO attribute ram_style of cache_tags : signal is "distributed";

        """note: these are passed to nmigen.hdl.Memory as "attributes".
           don't know how, just that they are.
        """
        # TODO attribute ram_style of
        #  dtlb_tags : signal is "distributed";
        # TODO attribute ram_style of
        #  dtlb_ptes : signal is "distributed";

        r0      = RegStage0("r0")
        r0_full = Signal()

        r1 = RegStage1("r1")

        reservation = Reservation()

        # Async signals on incoming request
        req_index    = Signal(INDEX_BITS)
        req_row      = Signal(ROW_BITS)
        req_hit_way  = Signal(WAY_BITS)
        req_tag      = Signal(TAG_BITS)
        req_op       = Signal(Op)
        req_data     = Signal(64)
        req_same_tag = Signal()
        req_go       = Signal()

        early_req_row = Signal(ROW_BITS)

        cancel_store = Signal()
        set_rsrv     = Signal()
        clear_rsrv   = Signal()

        r0_valid = Signal()
        r0_stall = Signal()

        use_forward1_next = Signal()
        use_forward2_next = Signal()

        cache_out_row = Signal(WB_DATA_BITS)

        plru_victim = Signal(WAY_BITS)
        replace_way = Signal(WAY_BITS)

        # Wishbone read/write/cache write formatting signals
        bus_sel = Signal(8)

        # TLB signals
        tlb_way       = TLBRecord("tlb_way")
        tlb_req_index = Signal(TLB_SET_BITS)
        tlb_hit       = TLBHit("tlb_hit")
        pte           = Signal(TLB_PTE_BITS)
        ra            = Signal(REAL_ADDR_BITS)
        valid_ra      = Signal()
        perm_attr     = PermAttr("dc_perms")
        rc_ok         = Signal()
        perm_ok       = Signal()
        access_ok     = Signal()

        tlb_plru_victim = Signal(TLB_WAY_BITS)

        # we don't yet handle collisions between loadstore1 requests
        # and MMU requests
        comb += self.m_out.stall.eq(0)

        # Hold off the request in r0 when r1 has an uncompleted request
        comb += r0_stall.eq(r0_full & (r1.full | d_in.hold))
        comb += r0_valid.eq(r0_full & ~r1.full & ~d_in.hold)
        comb += self.stall_out.eq(r0_stall)

        # deal with litex not doing wishbone pipeline mode
        # XXX in wrong way.  FIFOs are needed in the SRAM test
        # so that stb/ack match up. same thing done in icache.py
        comb += self.bus.stall.eq(self.bus.cyc & ~self.bus.ack)

        # Wire up wishbone request latch out of stage 1
        comb += self.bus.we.eq(r1.wb.we)
        comb += self.bus.adr.eq(r1.wb.adr)
        comb += self.bus.sel.eq(r1.wb.sel)
        comb += self.bus.stb.eq(r1.wb.stb)
        comb += self.bus.dat_w.eq(r1.wb.dat)
        comb += self.bus.cyc.eq(r1.wb.cyc)

        # create submodule TLBUpdate
        m.submodules.dtlb_update = self.dtlb_update = DTLBUpdate()
        dtlb = self.dtlb_update.dtlb

        # call sub-functions putting everything together, using shared
        # signals established above
        self.stage_0(m, r0, r1, r0_full)
        self.tlb_read(m, r0_stall, tlb_way, dtlb)
        self.tlb_search(m, tlb_req_index, r0, r0_valid,
                        tlb_way,
                        pte, tlb_hit, valid_ra, perm_attr, ra)
        self.tlb_update(m, r0_valid, r0, dtlb, tlb_req_index,
                        tlb_hit, tlb_plru_victim,
                        tlb_way)
        self.maybe_plrus(m, r1, plru_victim)
        self.maybe_tlb_plrus(m, r1, tlb_plru_victim, tlb_req_index)
        self.cache_tag_read(m, r0_stall, req_index, cache_tag_set, cache_tags)
        self.dcache_request(m, r0, ra, req_index, req_row, req_tag,
                            r0_valid, r1, cache_tags, replace_way,
                            use_forward1_next, use_forward2_next,
                            req_hit_way, plru_victim, rc_ok, perm_attr,
                            valid_ra, perm_ok, access_ok, req_op, req_go,
                            tlb_hit, tlb_way, cache_tag_set,
                            cancel_store, req_same_tag, r0_stall,
                            early_req_row)
        self.reservation_comb(m, cancel_store, set_rsrv, clear_rsrv,
                              r0_valid, r0, reservation)
        self.reservation_reg(m, r0_valid, access_ok, set_rsrv, clear_rsrv,
                             reservation, r0)
        self.writeback_control(m, r1, cache_out_row)
        self.rams(m, r1, early_req_row, cache_out_row, replace_way)
        self.dcache_fast_hit(m, req_op, r0_valid, r0, r1,
                             req_hit_way, req_index, req_tag, access_ok,
                             tlb_hit, tlb_req_index)
        self.dcache_slow(m, r1, use_forward1_next, use_forward2_next,
                         r0, replace_way,
                         req_hit_way, req_same_tag,
                         r0_valid, req_op, cache_tags, req_go, ra)
        #self.dcache_log(m, r1, valid_ra, tlb_hit, self.stall_out, req_op)

        return m


if __name__ == '__main__':
    dut = DCache()
    vl = rtlil.convert(dut, ports=[])
    with open("test_dcache.il", "w") as f:
        f.write(vl)
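
# Running this file directly writes the RTLIL to "test_dcache.il" (see
# above).  As a minimal sanity sketch (an assumption, not the project's
# actual test bench), the imported Simulator can also elaborate the
# design, e.g.:
#
#     sim = Simulator(DCache())
#     sim.add_clock(2e-6)
#
# though exercising real loads/stores additionally needs a wishbone
# memory model such as soc.bus.sram.SRAM wired up to dut.bus.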