src/soc/experiment/dcache.py

   1 """DCache
   2
   3 based on Anton Blanchard microwatt dcache.vhdl
   4
   5 note that the microwatt dcache wishbone interface expects "stall".
   6 for simplicity at the moment this is hard-coded to cyc & ~ack.
   7 see WB4 spec, p84, section 5.2.1
   8
   9 IMPORTANT: for store, the data is sampled the cycle AFTER the "valid"
  10 is raised.  sigh
  11
  12 Links:
  13
  14 * https://libre-soc.org/3d_gpu/architecture/set_associative_cache.jpg
  15 * https://bugs.libre-soc.org/show_bug.cgi?id=469
  16
  17 """
  18
  19 import sys
  20
  21 from nmutil.gtkw import write_gtkw
  22
  23 sys.setrecursionlimit(1000000)
  24
  25 from enum import Enum, unique
  26
  27 from nmigen import (Module, Signal, Elaboratable, Cat, Repl, Array, Const,
  28                     Record)
  29 from nmutil.util import Display
  30 from nmigen.lib.coding import Decoder
  31
  32 from copy import deepcopy
  33 from random import randint, seed
  34
  35 from nmigen_soc.wishbone.bus import Interface
  36
  37 from nmigen.cli import main
  38 from nmutil.iocontrol import RecordObject
  39 from nmigen.utils import log2_int
  40 from soc.experiment.mem_types import (LoadStore1ToDCacheType,
  41                                      DCacheToLoadStore1Type,
  42                                      MMUToDCacheType,
  43                                      DCacheToMMUType)
  44
  45 from soc.experiment.wb_types import (WB_ADDR_BITS, WB_DATA_BITS, WB_SEL_BITS,
  46                                 WBAddrType, WBDataType, WBSelType,
  47                                 WBMasterOut, WBSlaveOut,
  48                                 WBMasterOutVector, WBSlaveOutVector,
  49                                 WBIOMasterOut, WBIOSlaveOut)
  50
  51 from soc.experiment.cache_ram import CacheRam
  52 #from soc.experiment.plru import PLRU
  53 from nmutil.plru import PLRU, PLRUs
  54
  55 # for test
  56 from soc.bus.sram import SRAM
  57 from nmigen import Memory
  58 from nmigen.cli import rtlil
  59
  60 # NOTE: to use cxxsim, export NMIGEN_SIM_MODE=cxxsim from the shell
  61 # Also, check out the cxxsim nmigen branch, and latest yosys from git
  62 from nmutil.sim_tmp_alternative import Simulator
  63
  64 from nmutil.util import wrap
  65
  66
  67 # TODO: make these parameters of DCache at some point
  68 LINE_SIZE = 64    # Line size in bytes
  69 NUM_LINES = 16    # Number of lines in a set
  70 NUM_WAYS = 4      # Number of ways
  71 TLB_SET_SIZE = 64 # L1 DTLB entries per set
  72 TLB_NUM_WAYS = 2  # L1 DTLB number of sets
  73 TLB_LG_PGSZ = 12  # L1 DTLB log_2(page_size)
  74 LOG_LENGTH = 0    # Non-zero to enable log data collection
  75
  76 # BRAM organisation: We never access more than
  77 #     -- WB_DATA_BITS at a time so to save
  78 #     -- resources we make the array only that wide, and
  79 #     -- use consecutive indices to make a cache "line"
  80 #     --
  81 #     -- ROW_SIZE is the width in bytes of the BRAM
  82 #     -- (based on WB, so 64-bits)
  83 ROW_SIZE = WB_DATA_BITS // 8;
  84
  85 # ROW_PER_LINE is the number of row (wishbone
  86 # transactions) in a line
  87 ROW_PER_LINE = LINE_SIZE // ROW_SIZE
  88
  89 # BRAM_ROWS is the number of rows in BRAM needed
  90 # to represent the full dcache
  91 BRAM_ROWS = NUM_LINES * ROW_PER_LINE
  92
  93 print ("ROW_SIZE", ROW_SIZE)
  94 print ("ROW_PER_LINE", ROW_PER_LINE)
  95 print ("BRAM_ROWS", BRAM_ROWS)
  96 print ("NUM_WAYS", NUM_WAYS)
  97
  98 # Bit fields counts in the address
  99
 100 # REAL_ADDR_BITS is the number of real address
 101 # bits that we store
 102 REAL_ADDR_BITS = 56
 103
 104 # ROW_BITS is the number of bits to select a row
 105 ROW_BITS = log2_int(BRAM_ROWS)
 106
 107 # ROW_LINE_BITS is the number of bits to select
 108 # a row within a line
 109 ROW_LINE_BITS = log2_int(ROW_PER_LINE)
 110
 111 # LINE_OFF_BITS is the number of bits for
 112 # the offset in a cache line
 113 LINE_OFF_BITS = log2_int(LINE_SIZE)
 114
 115 # ROW_OFF_BITS is the number of bits for
 116 # the offset in a row
 117 ROW_OFF_BITS = log2_int(ROW_SIZE)
 118
 119 # INDEX_BITS is the number if bits to
 120 # select a cache line
 121 INDEX_BITS = log2_int(NUM_LINES)
 122
 123 # SET_SIZE_BITS is the log base 2 of the set size
 124 SET_SIZE_BITS = LINE_OFF_BITS + INDEX_BITS
 125
 126 # TAG_BITS is the number of bits of
 127 # the tag part of the address
 128 TAG_BITS = REAL_ADDR_BITS - SET_SIZE_BITS
 129
 130 # TAG_WIDTH is the width in bits of each way of the tag RAM
 131 TAG_WIDTH = TAG_BITS + 7 - ((TAG_BITS + 7) % 8)
 132
 133 # WAY_BITS is the number of bits to select a way
 134 WAY_BITS = log2_int(NUM_WAYS)
 135
 136 # Example of layout for 32 lines of 64 bytes:
 137 layout = """\
 138   ..  tag    |index|  line  |
 139   ..         |   row   |    |
 140   ..         |     |---|    | ROW_LINE_BITS  (3)
 141   ..         |     |--- - --| LINE_OFF_BITS (6)
 142   ..         |         |- --| ROW_OFF_BITS  (3)
 143   ..         |----- ---|    | ROW_BITS      (8)
 144   ..         |-----|        | INDEX_BITS    (5)
 145   .. --------|              | TAG_BITS      (45)
 146 """
 147 print (layout)
 148 print ("Dcache TAG %d IDX %d ROW_BITS %d ROFF %d LOFF %d RLB %d" % \
 149             (TAG_BITS, INDEX_BITS, ROW_BITS,
 150              ROW_OFF_BITS, LINE_OFF_BITS, ROW_LINE_BITS))
 151 print ("index @: %d-%d" % (LINE_OFF_BITS, SET_SIZE_BITS))
 152 print ("row @: %d-%d" % (LINE_OFF_BITS, ROW_OFF_BITS))
 153 print ("tag @: %d-%d width %d" % (SET_SIZE_BITS, REAL_ADDR_BITS, TAG_WIDTH))
 154
 155 TAG_RAM_WIDTH = TAG_WIDTH * NUM_WAYS
 156
 157 print ("TAG_RAM_WIDTH", TAG_RAM_WIDTH)
 158
 159 def CacheTagArray():
 160     tag_layout = [('valid', 1),
 161                   ('tag', TAG_RAM_WIDTH),
 162                  ]
 163     return Array(Record(tag_layout, name="tag%d" % x) for x in range(NUM_LINES))
 164
 165 def RowPerLineValidArray():
 166     return Array(Signal(name="rows_valid%d" % x) \
 167                         for x in range(ROW_PER_LINE))
 168
 169 # L1 TLB
 170 TLB_SET_BITS     = log2_int(TLB_SET_SIZE)
 171 TLB_WAY_BITS     = log2_int(TLB_NUM_WAYS)
 172 TLB_EA_TAG_BITS  = 64 - (TLB_LG_PGSZ + TLB_SET_BITS)
 173 TLB_TAG_WAY_BITS = TLB_NUM_WAYS * TLB_EA_TAG_BITS
 174 TLB_PTE_BITS     = 64
 175 TLB_PTE_WAY_BITS = TLB_NUM_WAYS * TLB_PTE_BITS;
 176
 177 def ispow2(x):
 178     return (1<<log2_int(x, False)) == x
 179
 180 assert (LINE_SIZE % ROW_SIZE) == 0, "LINE_SIZE not multiple of ROW_SIZE"
 181 assert ispow2(LINE_SIZE), "LINE_SIZE not power of 2"
 182 assert ispow2(NUM_LINES), "NUM_LINES not power of 2"
 183 assert ispow2(ROW_PER_LINE), "ROW_PER_LINE not power of 2"
 184 assert ROW_BITS == (INDEX_BITS + ROW_LINE_BITS), "geometry bits don't add up"
 185 assert (LINE_OFF_BITS == ROW_OFF_BITS + ROW_LINE_BITS), \
 186         "geometry bits don't add up"
 187 assert REAL_ADDR_BITS == (TAG_BITS + INDEX_BITS + LINE_OFF_BITS), \
 188         "geometry bits don't add up"
 189 assert REAL_ADDR_BITS == (TAG_BITS + ROW_BITS + ROW_OFF_BITS), \
 190          "geometry bits don't add up"
 191 assert 64 == WB_DATA_BITS, "Can't yet handle wb width that isn't 64-bits"
 192 assert SET_SIZE_BITS <= TLB_LG_PGSZ, "Set indexed by virtual address"
 193
 194 def TLBHit(name):
 195     return Record([('valid', 1),
 196                    ('way', TLB_WAY_BITS)], name=name)
 197
 198 def TLBTagEAArray():
 199     return Array(Signal(TLB_EA_TAG_BITS, name="tlbtagea%d" % x) \
 200                 for x in range (TLB_NUM_WAYS))
 201
 202 def TLBRecord(name):
 203     tlb_layout = [('valid', TLB_NUM_WAYS),
 204                   ('tag', TLB_TAG_WAY_BITS),
 205                   ('pte', TLB_PTE_WAY_BITS)
 206                  ]
 207     return Record(tlb_layout, name=name)
 208
 209 def TLBArray():
 210     return Array(TLBRecord(name="tlb%d" % x) for x in range(TLB_SET_SIZE))
 211
 212 def HitWaySet():
 213     return Array(Signal(WAY_BITS, name="hitway_%d" % x) \
 214                         for x in range(TLB_NUM_WAYS))
 215
 216 # Cache RAM interface
 217 def CacheRamOut():
 218     return Array(Signal(WB_DATA_BITS, name="cache_out%d" % x) \
 219                  for x in range(NUM_WAYS))
 220
 221 # PLRU output interface
 222 def PLRUOut():
 223     return Array(Signal(WAY_BITS, name="plru_out%d" % x) \
 224                 for x in range(NUM_LINES))
 225
 226 # TLB PLRU output interface
 227 def TLBPLRUOut():
 228     return Array(Signal(TLB_WAY_BITS, name="tlbplru_out%d" % x) \
 229                 for x in range(TLB_SET_SIZE))
 230
 231 # Helper functions to decode incoming requests
 232 #
 233 # Return the cache line index (tag index) for an address
 234 def get_index(addr):
 235     return addr[LINE_OFF_BITS:SET_SIZE_BITS]
 236
 237 # Return the cache row index (data memory) for an address
 238 def get_row(addr):
 239     return addr[ROW_OFF_BITS:SET_SIZE_BITS]
 240
 241 # Return the index of a row within a line
 242 def get_row_of_line(row):
 243     return row[:ROW_BITS][:ROW_LINE_BITS]
 244
 245 # Returns whether this is the last row of a line
 246 def is_last_row_addr(addr, last):
 247     return addr[ROW_OFF_BITS:LINE_OFF_BITS] == last
 248
 249 # Returns whether this is the last row of a line
 250 def is_last_row(row, last):
 251     return get_row_of_line(row) == last
 252
 253 # Return the next row in the current cache line. We use a
 254 # dedicated function in order to limit the size of the
 255 # generated adder to be only the bits within a cache line
 256 # (3 bits with default settings)
 257 def next_row(row):
 258     row_v = row[0:ROW_LINE_BITS] + 1
 259     return Cat(row_v[:ROW_LINE_BITS], row[ROW_LINE_BITS:])
 260
 261 # Get the tag value from the address
 262 def get_tag(addr):
 263     return addr[SET_SIZE_BITS:REAL_ADDR_BITS]
 264
 265 # Read a tag from a tag memory row
 266 def read_tag(way, tagset):
 267     return tagset.word_select(way, TAG_WIDTH)[:TAG_BITS]
 268
 269 # Read a TLB tag from a TLB tag memory row
 270 def read_tlb_tag(way, tags):
 271     return tags.word_select(way, TLB_EA_TAG_BITS)
 272
 273 # Write a TLB tag to a TLB tag memory row
 274 def write_tlb_tag(way, tags, tag):
 275     return read_tlb_tag(way, tags).eq(tag)
 276
 277 # Read a PTE from a TLB PTE memory row
 278 def read_tlb_pte(way, ptes):
 279     return ptes.word_select(way, TLB_PTE_BITS)
 280
 281 def write_tlb_pte(way, ptes, newpte):
 282     return read_tlb_pte(way, ptes).eq(newpte)
 283
 284
 285 # Record for storing permission, attribute, etc. bits from a PTE
 286 class PermAttr(RecordObject):
 287     def __init__(self, name=None):
 288         super().__init__(name=name)
 289         self.reference = Signal()
 290         self.changed   = Signal()
 291         self.nocache   = Signal()
 292         self.priv      = Signal()
 293         self.rd_perm   = Signal()
 294         self.wr_perm   = Signal()
 295
 296
 297 def extract_perm_attr(pte):
 298     pa = PermAttr()
 299     return pa;
 300
 301
 302 # Type of operation on a "valid" input
 303 @unique
 304 class Op(Enum):
 305     OP_NONE       = 0
 306     OP_BAD        = 1 # NC cache hit, TLB miss, prot/RC failure
 307     OP_STCX_FAIL  = 2 # conditional store w/o reservation
 308     OP_LOAD_HIT   = 3 # Cache hit on load
 309     OP_LOAD_MISS  = 4 # Load missing cache
 310     OP_LOAD_NC    = 5 # Non-cachable load
 311     OP_STORE_HIT  = 6 # Store hitting cache
 312     OP_STORE_MISS = 7 # Store missing cache
 313
 314
 315 # Cache state machine
 316 @unique
 317 class State(Enum):
 318     IDLE             = 0 # Normal load hit processing
 319     RELOAD_WAIT_ACK  = 1 # Cache reload wait ack
 320     STORE_WAIT_ACK   = 2 # Store wait ack
 321     NC_LOAD_WAIT_ACK = 3 # Non-cachable load wait ack
 322
 323
 324 # Dcache operations:
 325 #
 326 # In order to make timing, we use the BRAMs with
 327 # an output buffer, which means that the BRAM
 328 # output is delayed by an extra cycle.
 329 #
 330 # Thus, the dcache has a 2-stage internal pipeline
 331 # for cache hits with no stalls.
 332 #
 333 # All other operations are handled via stalling
 334 # in the first stage.
 335 #
 336 # The second stage can thus complete a hit at the same
 337 # time as the first stage emits a stall for a complex op.
 338 #
 339 # Stage 0 register, basically contains just the latched request
 340
 341 class RegStage0(RecordObject):
 342     def __init__(self, name=None):
 343         super().__init__(name=name)
 344         self.req     = LoadStore1ToDCacheType(name="lsmem")
 345         self.tlbie   = Signal() # indicates a tlbie request (from MMU)
 346         self.doall   = Signal() # with tlbie, indicates flush whole TLB
 347         self.tlbld   = Signal() # indicates a TLB load request (from MMU)
 348         self.mmu_req = Signal() # indicates source of request
 349         self.d_valid = Signal() # indicates req.data is valid now
 350
 351
 352 class MemAccessRequest(RecordObject):
 353     def __init__(self, name=None):
 354         super().__init__(name=name)
 355         self.op        = Signal(Op)
 356         self.valid     = Signal()
 357         self.dcbz      = Signal()
 358         self.real_addr = Signal(REAL_ADDR_BITS)
 359         self.data      = Signal(64)
 360         self.byte_sel  = Signal(8)
 361         self.hit_way   = Signal(WAY_BITS)
 362         self.same_tag  = Signal()
 363         self.mmu_req   = Signal()
 364
 365
 366 # First stage register, contains state for stage 1 of load hits
 367 # and for the state machine used by all other operations
 368 class RegStage1(RecordObject):
 369     def __init__(self, name=None):
 370         super().__init__(name=name)
 371         # Info about the request
 372         self.full             = Signal() # have uncompleted request
 373         self.mmu_req          = Signal() # request is from MMU
 374         self.req              = MemAccessRequest(name="reqmem")
 375
 376         # Cache hit state
 377         self.hit_way          = Signal(WAY_BITS)
 378         self.hit_load_valid   = Signal()
 379         self.hit_index        = Signal(INDEX_BITS)
 380         self.cache_hit        = Signal()
 381
 382         # TLB hit state
 383         self.tlb_hit          = TLBHit("tlb_hit")
 384         self.tlb_hit_index    = Signal(TLB_SET_BITS)
 385
 386         # 2-stage data buffer for data forwarded from writes to reads
 387         self.forward_data1    = Signal(64)
 388         self.forward_data2    = Signal(64)
 389         self.forward_sel1     = Signal(8)
 390         self.forward_valid1   = Signal()
 391         self.forward_way1     = Signal(WAY_BITS)
 392         self.forward_row1     = Signal(ROW_BITS)
 393         self.use_forward1     = Signal()
 394         self.forward_sel      = Signal(8)
 395
 396         # Cache miss state (reload state machine)
 397         self.state            = Signal(State)
 398         self.dcbz             = Signal()
 399         self.write_bram       = Signal()
 400         self.write_tag        = Signal()
 401         self.slow_valid       = Signal()
 402         self.wb               = WBMasterOut("wb")
 403         self.reload_tag       = Signal(TAG_BITS)
 404         self.store_way        = Signal(WAY_BITS)
 405         self.store_row        = Signal(ROW_BITS)
 406         self.store_index      = Signal(INDEX_BITS)
 407         self.end_row_ix       = Signal(ROW_LINE_BITS)
 408         self.rows_valid       = RowPerLineValidArray()
 409         self.acks_pending     = Signal(3)
 410         self.inc_acks         = Signal()
 411         self.dec_acks         = Signal()
 412
 413         # Signals to complete (possibly with error)
 414         self.ls_valid         = Signal()
 415         self.ls_error         = Signal()
 416         self.mmu_done         = Signal()
 417         self.mmu_error        = Signal()
 418         self.cache_paradox    = Signal()
 419
 420         # Signal to complete a failed stcx.
 421         self.stcx_fail        = Signal()
 422
 423
 424 # Reservation information
 425 class Reservation(RecordObject):
 426     def __init__(self):
 427         super().__init__()
 428         self.valid = Signal()
 429         self.addr  = Signal(64-LINE_OFF_BITS)
 430
 431
 432 class DTLBUpdate(Elaboratable):
 433     def __init__(self):
 434         self.dtlb     = TLBArray()
 435         self.tlbie    = Signal()
 436         self.tlbwe    = Signal()
 437         self.doall    = Signal()
 438         self.tlb_hit     = TLBHit("tlb_hit")
 439         self.tlb_req_index = Signal(TLB_SET_BITS)
 440
 441         self.tlb_tag_way     = Signal(TLB_TAG_WAY_BITS)
 442         self.tlb_pte_way     = Signal(TLB_PTE_WAY_BITS)
 443         self.repl_way        = Signal(TLB_WAY_BITS)
 444         self.eatag           = Signal(TLB_EA_TAG_BITS)
 445         self.pte_data        = Signal(TLB_PTE_BITS)
 446
 447         self.dv = Signal(TLB_NUM_WAYS) # tlb_way_valids_t
 448
 449         # read from dtlb array
 450         self.tlb_read       = Signal()
 451         self.tlb_read_index = Signal(TLB_SET_BITS)
 452         self.tlb_way        = TLBRecord("o_tlb_way")
 453
 454     def elaborate(self, platform):
 455         m = Module()
 456         comb = m.d.comb
 457         sync = m.d.sync
 458
 459         tagset   = Signal(TLB_TAG_WAY_BITS)
 460         pteset   = Signal(TLB_PTE_WAY_BITS)
 461         updated  = Signal()
 462         v_updated  = Signal()
 463         tb_out = Signal(TLB_TAG_WAY_BITS) # tlb_way_tags_t
 464         db_out = Signal(TLB_NUM_WAYS)     # tlb_way_valids_t
 465         pb_out = Signal(TLB_PTE_WAY_BITS) # tlb_way_ptes_t
 466
 467         dtlb, tlb_req_index = self.dtlb, self.tlb_req_index
 468         comb += db_out.eq(self.dv)
 469
 470         with m.If(self.tlbie & self.doall):
 471             # clear all valid bits at once
 472             for i in range(TLB_SET_SIZE):
 473                 sync += dtlb[i].valid.eq(0)
 474         with m.Elif(self.tlbie):
 475             with m.If(self.tlb_hit.valid):
 476                 comb += db_out.bit_select(self.tlb_hit.way, 1).eq(0)
 477                 comb += v_updated.eq(1)
 478
 479         with m.Elif(self.tlbwe):
 480
 481             comb += tagset.eq(self.tlb_tag_way)
 482             comb += write_tlb_tag(self.repl_way, tagset, self.eatag)
 483             comb += tb_out.eq(tagset)
 484
 485             comb += pteset.eq(self.tlb_pte_way)
 486             comb += write_tlb_pte(self.repl_way, pteset, self.pte_data)
 487             comb += pb_out.eq(pteset)
 488
 489             comb += db_out.bit_select(self.repl_way, 1).eq(1)
 490
 491             comb += updated.eq(1)
 492             comb += v_updated.eq(1)
 493
 494         with m.If(updated):
 495             sync += dtlb[tlb_req_index].tag.eq(tb_out)
 496             sync += dtlb[tlb_req_index].pte.eq(pb_out)
 497         with m.If(v_updated):
 498             sync += dtlb[tlb_req_index].valid.eq(db_out)
 499
 500         comb += self.dv.eq(dtlb[tlb_req_index].valid)
 501
 502         # select one TLB way
 503         with m.If(self.tlb_read):
 504             sync += self.tlb_way.eq(dtlb[self.tlb_read_index])
 505
 506         return m
 507
 508
 509 class DCachePendingHit(Elaboratable):
 510
 511     def __init__(self, tlb_way,
 512                       cache_i_validdx, cache_tag_set,
 513                     req_addr,
 514                     hit_set):
 515
 516         self.go          = Signal()
 517         self.virt_mode   = Signal()
 518         self.is_hit      = Signal()
 519         self.tlb_hit      = TLBHit("tlb_hit")
 520         self.hit_way     = Signal(WAY_BITS)
 521         self.rel_match   = Signal()
 522         self.req_index   = Signal(INDEX_BITS)
 523         self.reload_tag  = Signal(TAG_BITS)
 524
 525         self.tlb_way = tlb_way
 526         self.cache_i_validdx = cache_i_validdx
 527         self.cache_tag_set = cache_tag_set
 528         self.req_addr = req_addr
 529         self.hit_set = hit_set
 530
 531     def elaborate(self, platform):
 532         m = Module()
 533         comb = m.d.comb
 534         sync = m.d.sync
 535
 536         go = self.go
 537         virt_mode = self.virt_mode
 538         is_hit = self.is_hit
 539         tlb_way = self.tlb_way
 540         cache_i_validdx = self.cache_i_validdx
 541         cache_tag_set = self.cache_tag_set
 542         req_addr = self.req_addr
 543         tlb_hit = self.tlb_hit
 544         hit_set = self.hit_set
 545         hit_way = self.hit_way
 546         rel_match = self.rel_match
 547         req_index = self.req_index
 548         reload_tag = self.reload_tag
 549
 550         rel_matches = Array(Signal(name="rel_matches_%d" % i) \
 551                                     for i in range(TLB_NUM_WAYS))
 552         hit_way_set = HitWaySet()
 553
 554         # Test if pending request is a hit on any way
 555         # In order to make timing in virtual mode,
 556         # when we are using the TLB, we compare each
 557         # way with each of the real addresses from each way of
 558         # the TLB, and then decide later which match to use.
 559
 560         with m.If(virt_mode):
 561             for j in range(TLB_NUM_WAYS): # tlb_num_way_t
 562                 s_tag       = Signal(TAG_BITS, name="s_tag%d" % j)
 563                 s_hit       = Signal()
 564                 s_pte       = Signal(TLB_PTE_BITS)
 565                 s_ra        = Signal(REAL_ADDR_BITS)
 566                 comb += s_pte.eq(read_tlb_pte(j, tlb_way.pte))
 567                 comb += s_ra.eq(Cat(req_addr[0:TLB_LG_PGSZ],
 568                                     s_pte[TLB_LG_PGSZ:REAL_ADDR_BITS]))
 569                 comb += s_tag.eq(get_tag(s_ra))
 570
 571                 for i in range(NUM_WAYS): # way_t
 572                     is_tag_hit = Signal(name="is_tag_hit_%d_%d" % (j, i))
 573                     comb += is_tag_hit.eq(go & cache_i_validdx[i] &
 574                                   (read_tag(i, cache_tag_set) == s_tag)
 575                                   & (tlb_way.valid[j]))
 576                     with m.If(is_tag_hit):
 577                         comb += hit_way_set[j].eq(i)
 578                         comb += s_hit.eq(1)
 579                 comb += hit_set[j].eq(s_hit)
 580                 with m.If(s_tag == reload_tag):
 581                     comb += rel_matches[j].eq(1)
 582             with m.If(tlb_hit.valid):
 583                 comb += is_hit.eq(hit_set[tlb_hit.way])
 584                 comb += hit_way.eq(hit_way_set[tlb_hit.way])
 585                 comb += rel_match.eq(rel_matches[tlb_hit.way])
 586         with m.Else():
 587             s_tag       = Signal(TAG_BITS)
 588             comb += s_tag.eq(get_tag(req_addr))
 589             for i in range(NUM_WAYS): # way_t
 590                 is_tag_hit = Signal(name="is_tag_hit_%d" % i)
 591                 comb += is_tag_hit.eq(go & cache_i_validdx[i] &
 592                           (read_tag(i, cache_tag_set) == s_tag))
 593                 with m.If(is_tag_hit):
 594                     comb += hit_way.eq(i)
 595                     comb += is_hit.eq(1)
 596             with m.If(s_tag == reload_tag):
 597                 comb += rel_match.eq(1)
 598
 599         return m
 600
 601
 602 class DCache(Elaboratable):
 603     """Set associative dcache write-through
 604
 605     TODO (in no specific order):
 606     * See list in icache.vhdl
 607     * Complete load misses on the cycle when WB data comes instead of
 608       at the end of line (this requires dealing with requests coming in
 609       while not idle...)
 610     """
 611     def __init__(self):
 612         self.d_in      = LoadStore1ToDCacheType("d_in")
 613         self.d_out     = DCacheToLoadStore1Type("d_out")
 614
 615         self.m_in      = MMUToDCacheType("m_in")
 616         self.m_out     = DCacheToMMUType("m_out")
 617
 618         self.stall_out = Signal()
 619
 620         # standard naming (wired to non-standard for compatibility)
 621         self.bus = Interface(addr_width=32,
 622                             data_width=64,
 623                             granularity=8,
 624                             features={'stall'},
 625                             alignment=0,
 626                             name="dcache")
 627
 628         self.log_out   = Signal(20)
 629
 630     def stage_0(self, m, r0, r1, r0_full):
 631         """Latch the request in r0.req as long as we're not stalling
 632         """
 633         comb = m.d.comb
 634         sync = m.d.sync
 635         d_in, d_out, m_in = self.d_in, self.d_out, self.m_in
 636
 637         r = RegStage0("stage0")
 638
 639         # TODO, this goes in unit tests and formal proofs
 640         with m.If(d_in.valid & m_in.valid):
 641             sync += Display("request collision loadstore vs MMU")
 642
 643         with m.If(m_in.valid):
 644             comb += r.req.valid.eq(1)
 645             comb += r.req.load.eq(~(m_in.tlbie | m_in.tlbld))# no invalidate
 646             comb += r.req.dcbz.eq(0)
 647             comb += r.req.nc.eq(0)
 648             comb += r.req.reserve.eq(0)
 649             comb += r.req.virt_mode.eq(0)
 650             comb += r.req.priv_mode.eq(1)
 651             comb += r.req.addr.eq(m_in.addr)
 652             comb += r.req.data.eq(m_in.pte)
 653             comb += r.req.byte_sel.eq(~0) # Const -1 sets all to 0b111....
 654             comb += r.tlbie.eq(m_in.tlbie)
 655             comb += r.doall.eq(m_in.doall)
 656             comb += r.tlbld.eq(m_in.tlbld)
 657             comb += r.mmu_req.eq(1)
 658             m.d.sync += Display("    DCACHE req mmu addr %x pte %x ld %d",
 659                                  m_in.addr, m_in.pte, r.req.load)
 660
 661         with m.Else():
 662             comb += r.req.eq(d_in)
 663             comb += r.req.data.eq(0)
 664             comb += r.tlbie.eq(0)
 665             comb += r.doall.eq(0)
 666             comb += r.tlbld.eq(0)
 667             comb += r.mmu_req.eq(0)
 668         with m.If((~r1.full & ~d_in.hold) | ~r0_full):
 669             sync += r0.eq(r)
 670             sync += r0_full.eq(r.req.valid)
 671             # Sample data the cycle after a request comes in from loadstore1.
 672             # If another request has come in already then the data will get
 673             # put directly into req.data below.
 674             with m.If(r0.req.valid & ~r.req.valid & ~r0.d_valid &
 675                      ~r0.mmu_req):
 676                 sync += r0.req.data.eq(d_in.data)
 677                 sync += r0.d_valid.eq(1)
 678         with m.If(d_in.valid):
 679             m.d.sync += Display("    DCACHE req cache "
 680                                 "virt %d addr %x data %x ld %d",
 681                                  r.req.virt_mode, r.req.addr,
 682                                  r.req.data, r.req.load)
 683
 684     def tlb_read(self, m, r0_stall, tlb_way, dtlb):
 685         """TLB
 686         Operates in the second cycle on the request latched in r0.req.
 687         TLB updates write the entry at the end of the second cycle.
 688         """
 689         comb = m.d.comb
 690         sync = m.d.sync
 691         m_in, d_in = self.m_in, self.d_in
 692
 693         addrbits = Signal(TLB_SET_BITS)
 694
 695         amin = TLB_LG_PGSZ
 696         amax = TLB_LG_PGSZ + TLB_SET_BITS
 697
 698         with m.If(m_in.valid):
 699             comb += addrbits.eq(m_in.addr[amin : amax])
 700         with m.Else():
 701             comb += addrbits.eq(d_in.addr[amin : amax])
 702
 703         # If we have any op and the previous op isn't finished,
 704         # then keep the same output for next cycle.
 705         d = self.dtlb_update
 706         comb += d.tlb_read_index.eq(addrbits)
 707         comb += d.tlb_read.eq(~r0_stall)
 708         comb += tlb_way.eq(d.tlb_way)
 709
 710     def maybe_tlb_plrus(self, m, r1, tlb_plru_victim, tlb_req_index):
 711         """Generate TLB PLRUs
 712         """
 713         comb = m.d.comb
 714         sync = m.d.sync
 715
 716         if TLB_NUM_WAYS == 0:
 717             return
 718
 719         # Binary-to-Unary one-hot, enabled by tlb_hit valid
 720         tlb_plrus = PLRUs(TLB_SET_SIZE, TLB_WAY_BITS)
 721         m.submodules.tlb_plrus = tlb_plrus
 722         comb += tlb_plrus.way.eq(r1.tlb_hit.way)
 723         comb += tlb_plrus.valid.eq(r1.tlb_hit.valid)
 724         comb += tlb_plrus.index.eq(r1.tlb_hit_index)
 725         comb += tlb_plrus.isel.eq(tlb_req_index) # select victim
 726         comb += tlb_plru_victim.eq(tlb_plrus.o_index) # selected victim
 727
 728     def tlb_search(self, m, tlb_req_index, r0, r0_valid,
 729                    tlb_way,
 730                    pte, tlb_hit, valid_ra, perm_attr, ra):
 731
 732         comb = m.d.comb
 733
 734         hitway = Signal(TLB_WAY_BITS)
 735         hit    = Signal()
 736         eatag  = Signal(TLB_EA_TAG_BITS)
 737
 738         TLB_LG_END = TLB_LG_PGSZ + TLB_SET_BITS
 739         comb += tlb_req_index.eq(r0.req.addr[TLB_LG_PGSZ : TLB_LG_END])
 740         comb += eatag.eq(r0.req.addr[TLB_LG_END : 64 ])
 741
 742         for i in range(TLB_NUM_WAYS):
 743             is_tag_hit = Signal(name="is_tag_hit%d" % i)
 744             tlb_tag = Signal(TLB_EA_TAG_BITS, name="tlb_tag%d" % i)
 745             comb += tlb_tag.eq(read_tlb_tag(i, tlb_way.tag))
 746             comb += is_tag_hit.eq((tlb_way.valid[i]) & (tlb_tag == eatag))
 747             with m.If(is_tag_hit):
 748                 comb += hitway.eq(i)
 749                 comb += hit.eq(1)
 750
 751         comb += tlb_hit.valid.eq(hit & r0_valid)
 752         comb += tlb_hit.way.eq(hitway)
 753
 754         with m.If(tlb_hit.valid):
 755             comb += pte.eq(read_tlb_pte(hitway, tlb_way.pte))
 756         comb += valid_ra.eq(tlb_hit.valid | ~r0.req.virt_mode)
 757
 758         with m.If(r0.req.virt_mode):
 759             comb += ra.eq(Cat(Const(0, ROW_OFF_BITS),
 760                               r0.req.addr[ROW_OFF_BITS:TLB_LG_PGSZ],
 761                               pte[TLB_LG_PGSZ:REAL_ADDR_BITS]))
 762             comb += perm_attr.reference.eq(pte[8])
 763             comb += perm_attr.changed.eq(pte[7])
 764             comb += perm_attr.nocache.eq(pte[5])
 765             comb += perm_attr.priv.eq(pte[3])
 766             comb += perm_attr.rd_perm.eq(pte[2])
 767             comb += perm_attr.wr_perm.eq(pte[1])
 768         with m.Else():
 769             comb += ra.eq(Cat(Const(0, ROW_OFF_BITS),
 770                               r0.req.addr[ROW_OFF_BITS:REAL_ADDR_BITS]))
 771             comb += perm_attr.reference.eq(1)
 772             comb += perm_attr.changed.eq(1)
 773             comb += perm_attr.nocache.eq(0)
 774             comb += perm_attr.priv.eq(1)
 775             comb += perm_attr.rd_perm.eq(1)
 776             comb += perm_attr.wr_perm.eq(1)
 777
 778         with m.If(valid_ra):
 779             m.d.sync += Display("DCACHE virt mode %d hit %d ra %x pte %x",
 780                                 r0.req.virt_mode, tlb_hit.valid, ra, pte)
 781             m.d.sync += Display("       perm ref=%d", perm_attr.reference)
 782             m.d.sync += Display("       perm chg=%d", perm_attr.changed)
 783             m.d.sync += Display("       perm noc=%d", perm_attr.nocache)
 784             m.d.sync += Display("       perm prv=%d", perm_attr.priv)
 785             m.d.sync += Display("       perm rdp=%d", perm_attr.rd_perm)
 786             m.d.sync += Display("       perm wrp=%d", perm_attr.wr_perm)
 787
 788     def tlb_update(self, m, r0_valid, r0, dtlb, tlb_req_index,
 789                     tlb_hit, tlb_plru_victim, tlb_way):
 790
 791         comb = m.d.comb
 792         sync = m.d.sync
 793
 794         tlbie    = Signal()
 795         tlbwe    = Signal()
 796
 797         comb += tlbie.eq(r0_valid & r0.tlbie)
 798         comb += tlbwe.eq(r0_valid & r0.tlbld)
 799
 800         d = self.dtlb_update
 801
 802         comb += d.tlbie.eq(tlbie)
 803         comb += d.tlbwe.eq(tlbwe)
 804         comb += d.doall.eq(r0.doall)
 805         comb += d.tlb_hit.eq(tlb_hit)
 806         comb += d.tlb_tag_way.eq(tlb_way.tag)
 807         comb += d.tlb_pte_way.eq(tlb_way.pte)
 808         comb += d.tlb_req_index.eq(tlb_req_index)
 809
 810         with m.If(tlb_hit.valid):
 811             comb += d.repl_way.eq(tlb_hit.way)
 812         with m.Else():
 813             comb += d.repl_way.eq(tlb_plru_victim)
 814         comb += d.eatag.eq(r0.req.addr[TLB_LG_PGSZ + TLB_SET_BITS:64])
 815         comb += d.pte_data.eq(r0.req.data)
 816
 817     def maybe_plrus(self, m, r1, plru_victim):
 818         """Generate PLRUs
 819         """
 820         comb = m.d.comb
 821         sync = m.d.sync
 822
 823         if TLB_NUM_WAYS == 0:
 824             return
 825
 826         m.submodules.plrus = plrus = PLRUs(NUM_LINES, WAY_BITS)
 827         comb += plrus.way.eq(r1.hit_way)
 828         comb += plrus.valid.eq(r1.cache_hit)
 829         comb += plrus.index.eq(r1.hit_index)
 830         comb += plrus.isel.eq(r1.store_index) # select victim
 831         comb += plru_victim.eq(plrus.o_index) # selected victim
 832
 833     def cache_tag_read(self, m, r0_stall, req_index, cache_tag_set, cache_tags):
 834         """Cache tag RAM read port
 835         """
 836         comb = m.d.comb
 837         sync = m.d.sync
 838         m_in, d_in = self.m_in, self.d_in
 839
 840         index = Signal(INDEX_BITS)
 841
 842         with m.If(r0_stall):
 843             comb += index.eq(req_index)
 844         with m.Elif(m_in.valid):
 845             comb += index.eq(get_index(m_in.addr))
 846         with m.Else():
 847             comb += index.eq(get_index(d_in.addr))
 848         sync += cache_tag_set.eq(cache_tags[index].tag)
 849
 850     def dcache_request(self, m, r0, ra, req_index, req_row, req_tag,
 851                        r0_valid, r1, cache_tags, replace_way,
 852                        use_forward1_next, use_forward2_next,
 853                        req_hit_way, plru_victim, rc_ok, perm_attr,
 854                        valid_ra, perm_ok, access_ok, req_op, req_go,
 855                        tlb_hit, tlb_way, cache_tag_set,
 856                        cancel_store, req_same_tag, r0_stall, early_req_row):
 857         """Cache request parsing and hit detection
 858         """
 859
 860         comb = m.d.comb
 861         m_in, d_in = self.m_in, self.d_in
 862
 863         is_hit      = Signal()
 864         hit_way     = Signal(WAY_BITS)
 865         op          = Signal(Op)
 866         opsel       = Signal(3)
 867         go          = Signal()
 868         nc          = Signal()
 869         hit_set     = Array(Signal(name="hit_set_%d" % i) \
 870                                   for i in range(TLB_NUM_WAYS))
 871         cache_i_validdx = Signal(NUM_WAYS)
 872
 873         # Extract line, row and tag from request
 874         comb += req_index.eq(get_index(r0.req.addr))
 875         comb += req_row.eq(get_row(r0.req.addr))
 876         comb += req_tag.eq(get_tag(ra))
 877
 878         if False: # display on comb is a bit... busy.
 879             comb += Display("dcache_req addr:%x ra: %x idx: %x tag: %x row: %x",
 880                     r0.req.addr, ra, req_index, req_tag, req_row)
 881
 882         comb += go.eq(r0_valid & ~(r0.tlbie | r0.tlbld) & ~r1.ls_error)
 883         comb += cache_i_validdx.eq(cache_tags[req_index].valid)
 884
 885         m.submodules.dcache_pend = dc = DCachePendingHit(tlb_way,
 886                                             cache_i_validdx, cache_tag_set,
 887                                             r0.req.addr,
 888                                             hit_set)
 889         comb += dc.tlb_hit.eq(tlb_hit)
 890         comb += dc.reload_tag.eq(r1.reload_tag)
 891         comb += dc.virt_mode.eq(r0.req.virt_mode)
 892         comb += dc.go.eq(go)
 893         comb += dc.req_index.eq(req_index)
 894
 895         comb += is_hit.eq(dc.is_hit)
 896         comb += hit_way.eq(dc.hit_way)
 897         comb += req_same_tag.eq(dc.rel_match)
 898
 899         # See if the request matches the line currently being reloaded
 900         with m.If((r1.state == State.RELOAD_WAIT_ACK) &
 901                   (req_index == r1.store_index) & req_same_tag):
 902             # For a store, consider this a hit even if the row isn't
 903             # valid since it will be by the time we perform the store.
 904             # For a load, check the appropriate row valid bit.
 905             rrow = Signal(ROW_LINE_BITS)
 906             comb += rrow.eq(req_row)
 907             valid = r1.rows_valid[rrow]
 908             comb += is_hit.eq((~r0.req.load) | valid)
 909             comb += hit_way.eq(replace_way)
 910
 911         # Whether to use forwarded data for a load or not
 912         with m.If((get_row(r1.req.real_addr) == req_row) &
 913                   (r1.req.hit_way == hit_way)):
 914             # Only need to consider r1.write_bram here, since if we
 915             # are writing refill data here, then we don't have a
 916             # cache hit this cycle on the line being refilled.
 917             # (There is the possibility that the load following the
 918             # load miss that started the refill could be to the old
 919             # contents of the victim line, since it is a couple of
 920             # cycles after the refill starts before we see the updated
 921             # cache tag. In that case we don't use the bypass.)
 922             comb += use_forward1_next.eq(r1.write_bram)
 923         with m.If((r1.forward_row1 == req_row) & (r1.forward_way1 == hit_way)):
 924             comb += use_forward2_next.eq(r1.forward_valid1)
 925
 926         # The way that matched on a hit
 927         comb += req_hit_way.eq(hit_way)
 928
 929         # The way to replace on a miss
 930         with m.If(r1.write_tag):
 931             comb += replace_way.eq(plru_victim)
 932         with m.Else():
 933             comb += replace_way.eq(r1.store_way)
 934
 935         # work out whether we have permission for this access
 936         # NB we don't yet implement AMR, thus no KUAP
 937         comb += rc_ok.eq(perm_attr.reference
 938                          & (r0.req.load | perm_attr.changed))
 939         comb += perm_ok.eq((r0.req.priv_mode | (~perm_attr.priv)) &
 940                            (perm_attr.wr_perm |
 941                               (r0.req.load & perm_attr.rd_perm)))
 942         comb += access_ok.eq(valid_ra & perm_ok & rc_ok)
 943
 944         # Combine the request and cache hit status to decide what
 945         # operation needs to be done
 946         comb += nc.eq(r0.req.nc | perm_attr.nocache)
 947         comb += op.eq(Op.OP_NONE)
 948         with m.If(go):
 949             with m.If(~access_ok):
 950                 m.d.sync += Display("DCACHE access fail valid_ra=%d p=%d rc=%d",
 951                                  valid_ra, perm_ok, rc_ok)
 952                 comb += op.eq(Op.OP_BAD)
 953             with m.Elif(cancel_store):
 954                 m.d.sync += Display("DCACHE cancel store")
 955                 comb += op.eq(Op.OP_STCX_FAIL)
 956             with m.Else():
 957                 m.d.sync += Display("DCACHE valid_ra=%d nc=%d ld=%d",
 958                                  valid_ra, nc, r0.req.load)
 959                 comb += opsel.eq(Cat(is_hit, nc, r0.req.load))
 960                 with m.Switch(opsel):
 961                     with m.Case(0b101): comb += op.eq(Op.OP_LOAD_HIT)
 962                     with m.Case(0b100): comb += op.eq(Op.OP_LOAD_MISS)
 963                     with m.Case(0b110): comb += op.eq(Op.OP_LOAD_NC)
 964                     with m.Case(0b001): comb += op.eq(Op.OP_STORE_HIT)
 965                     with m.Case(0b000): comb += op.eq(Op.OP_STORE_MISS)
 966                     with m.Case(0b010): comb += op.eq(Op.OP_STORE_MISS)
 967                     with m.Case(0b011): comb += op.eq(Op.OP_BAD)
 968                     with m.Case(0b111): comb += op.eq(Op.OP_BAD)
 969         comb += req_op.eq(op)
 970         comb += req_go.eq(go)
 971
 972         # Version of the row number that is valid one cycle earlier
 973         # in the cases where we need to read the cache data BRAM.
 974         # If we're stalling then we need to keep reading the last
 975         # row requested.
 976         with m.If(~r0_stall):
 977             with m.If(m_in.valid):
 978                 comb += early_req_row.eq(get_row(m_in.addr))
 979             with m.Else():
 980                 comb += early_req_row.eq(get_row(d_in.addr))
 981         with m.Else():
 982             comb += early_req_row.eq(req_row)
 983
 984     def reservation_comb(self, m, cancel_store, set_rsrv, clear_rsrv,
 985                          r0_valid, r0, reservation):
 986         """Handle load-with-reservation and store-conditional instructions
 987         """
 988         comb = m.d.comb
 989
 990         with m.If(r0_valid & r0.req.reserve):
 991             # XXX generate alignment interrupt if address
 992             # is not aligned XXX or if r0.req.nc = '1'
 993             with m.If(r0.req.load):
 994                 comb += set_rsrv.eq(r0.req.atomic_last) # load with reservation
 995             with m.Else():
 996                 comb += clear_rsrv.eq(r0.req.atomic_last) # store conditional
 997                 with m.If((~reservation.valid) |
 998                          (r0.req.addr[LINE_OFF_BITS:64] != reservation.addr)):
 999                     comb += cancel_store.eq(1)
1000
1001     def reservation_reg(self, m, r0_valid, access_ok, set_rsrv, clear_rsrv,
1002                         reservation, r0):
1003         comb = m.d.comb
1004         sync = m.d.sync
1005
1006         with m.If(r0_valid & access_ok):
1007             with m.If(clear_rsrv):
1008                 sync += reservation.valid.eq(0)
1009             with m.Elif(set_rsrv):
1010                 sync += reservation.valid.eq(1)
1011                 sync += reservation.addr.eq(r0.req.addr[LINE_OFF_BITS:64])
1012
1013     def writeback_control(self, m, r1, cache_out_row):
1014         """Return data for loads & completion control logic
1015         """
1016         comb = m.d.comb
1017         sync = m.d.sync
1018         d_out, m_out = self.d_out, self.m_out
1019
1020         data_out = Signal(64)
1021         data_fwd = Signal(64)
1022
1023         # Use the bypass if are reading the row that was
1024         # written 1 or 2 cycles ago, including for the
1025         # slow_valid = 1 case (i.e. completing a load
1026         # miss or a non-cacheable load).
1027         with m.If(r1.use_forward1):
1028             comb += data_fwd.eq(r1.forward_data1)
1029         with m.Else():
1030             comb += data_fwd.eq(r1.forward_data2)
1031
1032         comb += data_out.eq(cache_out_row)
1033
1034         for i in range(8):
1035             with m.If(r1.forward_sel[i]):
1036                 dsel = data_fwd.word_select(i, 8)
1037                 comb += data_out.word_select(i, 8).eq(dsel)
1038
1039         # DCache output to LoadStore
1040         comb += d_out.valid.eq(r1.ls_valid)
1041         comb += d_out.data.eq(data_out)
1042         comb += d_out.store_done.eq(~r1.stcx_fail)
1043         comb += d_out.error.eq(r1.ls_error)
1044         comb += d_out.cache_paradox.eq(r1.cache_paradox)
1045
1046         # Outputs to MMU
1047         comb += m_out.done.eq(r1.mmu_done)
1048         comb += m_out.err.eq(r1.mmu_error)
1049         comb += m_out.data.eq(data_out)
1050
1051         # We have a valid load or store hit or we just completed
1052         # a slow op such as a load miss, a NC load or a store
1053         #
1054         # Note: the load hit is delayed by one cycle. However it
1055         # can still not collide with r.slow_valid (well unless I
1056         # miscalculated) because slow_valid can only be set on a
1057         # subsequent request and not on its first cycle (the state
1058         # machine must have advanced), which makes slow_valid
1059         # at least 2 cycles from the previous hit_load_valid.
1060
1061         # Sanity: Only one of these must be set in any given cycle
1062
1063         if False: # TODO: need Display to get this to work
1064             assert (r1.slow_valid & r1.stcx_fail) != 1, \
1065             "unexpected slow_valid collision with stcx_fail"
1066
1067             assert ((r1.slow_valid | r1.stcx_fail) | r1.hit_load_valid) != 1, \
1068              "unexpected hit_load_delayed collision with slow_valid"
1069
1070         with m.If(~r1.mmu_req):
1071             # Request came from loadstore1...
1072             # Load hit case is the standard path
1073             with m.If(r1.hit_load_valid):
1074                 sync += Display("completing load hit data=%x", data_out)
1075
1076             # error cases complete without stalling
1077             with m.If(r1.ls_error):
1078                 with m.If(r1.dcbz):
1079                     sync += Display("completing dcbz with error")
1080                 with m.Else():
1081                     sync += Display("completing ld/st with error")
1082
1083             # Slow ops (load miss, NC, stores)
1084             with m.If(r1.slow_valid):
1085                 sync += Display("completing store or load miss adr=%x data=%x",
1086                                 r1.req.real_addr, data_out)
1087
1088         with m.Else():
1089             # Request came from MMU
1090             with m.If(r1.hit_load_valid):
1091                 sync += Display("completing load hit to MMU, data=%x",
1092                                 m_out.data)
1093             # error cases complete without stalling
1094             with m.If(r1.mmu_error):
1095                 sync += Display("combpleting MMU ld with error")
1096
1097             # Slow ops (i.e. load miss)
1098             with m.If(r1.slow_valid):
1099                 sync += Display("completing MMU load miss, adr=%x data=%x",
1100                                 r1.req.real_addr, m_out.data)
1101
1102     def rams(self, m, r1, early_req_row, cache_out_row, replace_way):
1103         """rams
1104         Generate a cache RAM for each way. This handles the normal
1105         reads, writes from reloads and the special store-hit update
1106         path as well.
1107
1108         Note: the BRAMs have an extra read buffer, meaning the output
1109         is pipelined an extra cycle. This differs from the
1110         icache. The writeback logic needs to take that into
1111         account by using 1-cycle delayed signals for load hits.
1112         """
1113         comb = m.d.comb
1114         bus = self.bus
1115
1116         # a Binary-to-Unary one-hots here.  replace-way one-hot is gated
1117         # (enabled) by bus.ack, not-write-bram, and state RELOAD_WAIT_ACK
1118         m.submodules.rams_replace_way_e = rwe = Decoder(NUM_WAYS)
1119         comb += rwe.n.eq(~((r1.state == State.RELOAD_WAIT_ACK) & bus.ack &
1120                    ~r1.write_bram))
1121         comb += rwe.i.eq(replace_way)
1122
1123         m.submodules.rams_hit_way_e = hwe = Decoder(NUM_WAYS)
1124         comb += hwe.i.eq(r1.hit_way)
1125
1126         # this one is gated with write_bram, and replace_way_e can never be
1127         # set at the same time.  that means that do_write can OR the outputs
1128         m.submodules.rams_hit_req_way_e = hre = Decoder(NUM_WAYS)
1129         comb += hre.n.eq(~r1.write_bram) # Decoder.n is inverted
1130         comb += hre.i.eq(r1.req.hit_way)
1131
1132         # common Signals
1133         do_read  = Signal()
1134         wr_addr  = Signal(ROW_BITS)
1135         wr_data  = Signal(WB_DATA_BITS)
1136         wr_sel   = Signal(ROW_SIZE)
1137         rd_addr  = Signal(ROW_BITS)
1138
1139         comb += do_read.eq(1) # always enable
1140         comb += rd_addr.eq(early_req_row)
1141
1142         # Write mux:
1143         #
1144         # Defaults to wishbone read responses (cache refill)
1145         #
1146         # For timing, the mux on wr_data/sel/addr is not
1147         # dependent on anything other than the current state.
1148
1149         with m.If(r1.write_bram):
1150             # Write store data to BRAM.  This happens one
1151             # cycle after the store is in r0.
1152             comb += wr_data.eq(r1.req.data)
1153             comb += wr_sel.eq(r1.req.byte_sel)
1154             comb += wr_addr.eq(get_row(r1.req.real_addr))
1155
1156         with m.Else():
1157             # Otherwise, we might be doing a reload or a DCBZ
1158             with m.If(r1.dcbz):
1159                 comb += wr_data.eq(0)
1160             with m.Else():
1161                 comb += wr_data.eq(bus.dat_r)
1162             comb += wr_addr.eq(r1.store_row)
1163             comb += wr_sel.eq(~0) # all 1s
1164
1165         # set up Cache Rams
1166         for i in range(NUM_WAYS):
1167             do_write = Signal(name="do_wr%d" % i)
1168             wr_sel_m = Signal(ROW_SIZE, name="wr_sel_m_%d" % i)
1169             d_out   = Signal(WB_DATA_BITS, name="dout_%d" % i) # cache_row_t
1170
1171             way = CacheRam(ROW_BITS, WB_DATA_BITS, ADD_BUF=True, ram_num=i)
1172             setattr(m.submodules, "cacheram_%d" % i, way)
1173
1174             comb += way.rd_en.eq(do_read)
1175             comb += way.rd_addr.eq(rd_addr)
1176             comb += d_out.eq(way.rd_data_o)
1177             comb += way.wr_sel.eq(wr_sel_m)
1178             comb += way.wr_addr.eq(wr_addr)
1179             comb += way.wr_data.eq(wr_data)
1180
1181             # Cache hit reads
1182             with m.If(hwe.o[i]):
1183                 comb += cache_out_row.eq(d_out)
1184
1185             # these are mutually-exclusive via their Decoder-enablers
1186             # (note: Decoder-enable is inverted)
1187             comb += do_write.eq(hre.o[i] | rwe.o[i])
1188
1189             # Mask write selects with do_write since BRAM
1190             # doesn't have a global write-enable
1191             with m.If(do_write):
1192                 comb += wr_sel_m.eq(wr_sel)
1193
1194     # Cache hit synchronous machine for the easy case.
1195     # This handles load hits.
1196     # It also handles error cases (TLB miss, cache paradox)
1197     def dcache_fast_hit(self, m, req_op, r0_valid, r0, r1,
1198                         req_hit_way, req_index, req_tag, access_ok,
1199                         tlb_hit, tlb_req_index):
1200         comb = m.d.comb
1201         sync = m.d.sync
1202
1203         with m.If(req_op != Op.OP_NONE):
1204             sync += Display("op:%d addr:%x nc: %d idx: %x tag: %x way: %x",
1205                     req_op, r0.req.addr, r0.req.nc,
1206                     req_index, req_tag, req_hit_way)
1207
1208         with m.If(r0_valid):
1209             sync += r1.mmu_req.eq(r0.mmu_req)
1210
1211         # Fast path for load/store hits.
1212         # Set signals for the writeback controls.
1213         sync += r1.hit_way.eq(req_hit_way)
1214         sync += r1.hit_index.eq(req_index)
1215
1216         sync += r1.hit_load_valid.eq(req_op == Op.OP_LOAD_HIT)
1217         sync += r1.cache_hit.eq((req_op == Op.OP_LOAD_HIT) |
1218                                 (req_op == Op.OP_STORE_HIT))
1219
1220         with m.If(req_op == Op.OP_BAD):
1221             sync += Display("Signalling ld/st error "
1222                             "ls_error=%i mmu_error=%i cache_paradox=%i",
1223                             ~r0.mmu_req,r0.mmu_req,access_ok)
1224             sync += r1.ls_error.eq(~r0.mmu_req)
1225             sync += r1.mmu_error.eq(r0.mmu_req)
1226             sync += r1.cache_paradox.eq(access_ok)
1227         with m.Else():
1228             sync += r1.ls_error.eq(0)
1229             sync += r1.mmu_error.eq(0)
1230             sync += r1.cache_paradox.eq(0)
1231
1232         sync += r1.stcx_fail.eq(req_op == Op.OP_STCX_FAIL)
1233
1234         # Record TLB hit information for updating TLB PLRU
1235         sync += r1.tlb_hit.eq(tlb_hit)
1236         sync += r1.tlb_hit_index.eq(tlb_req_index)
1237
1238     # Memory accesses are handled by this state machine:
1239     #
1240     #   * Cache load miss/reload (in conjunction with "rams")
1241     #   * Load hits for non-cachable forms
1242     #   * Stores (the collision case is handled in "rams")
1243     #
1244     # All wishbone requests generation is done here.
1245     # This machine operates at stage 1.
1246     def dcache_slow(self, m, r1, use_forward1_next, use_forward2_next,
1247                     r0, replace_way,
1248                     req_hit_way, req_same_tag,
1249                     r0_valid, req_op, cache_tags, req_go, ra):
1250
1251         comb = m.d.comb
1252         sync = m.d.sync
1253         bus = self.bus
1254         d_in = self.d_in
1255
1256         req         = MemAccessRequest("mreq_ds")
1257
1258         req_row = Signal(ROW_BITS)
1259         req_idx = Signal(INDEX_BITS)
1260         req_tag = Signal(TAG_BITS)
1261         comb += req_idx.eq(get_index(req.real_addr))
1262         comb += req_row.eq(get_row(req.real_addr))
1263         comb += req_tag.eq(get_tag(req.real_addr))
1264
1265         sync += r1.use_forward1.eq(use_forward1_next)
1266         sync += r1.forward_sel.eq(0)
1267
1268         with m.If(use_forward1_next):
1269             sync += r1.forward_sel.eq(r1.req.byte_sel)
1270         with m.Elif(use_forward2_next):
1271             sync += r1.forward_sel.eq(r1.forward_sel1)
1272
1273         sync += r1.forward_data2.eq(r1.forward_data1)
1274         with m.If(r1.write_bram):
1275             sync += r1.forward_data1.eq(r1.req.data)
1276             sync += r1.forward_sel1.eq(r1.req.byte_sel)
1277             sync += r1.forward_way1.eq(r1.req.hit_way)
1278             sync += r1.forward_row1.eq(get_row(r1.req.real_addr))
1279             sync += r1.forward_valid1.eq(1)
1280         with m.Else():
1281             with m.If(r1.dcbz):
1282                 sync += r1.forward_data1.eq(0)
1283             with m.Else():
1284                 sync += r1.forward_data1.eq(bus.dat_r)
1285             sync += r1.forward_sel1.eq(~0) # all 1s
1286             sync += r1.forward_way1.eq(replace_way)
1287             sync += r1.forward_row1.eq(r1.store_row)
1288             sync += r1.forward_valid1.eq(0)
1289
1290         # One cycle pulses reset
1291         sync += r1.slow_valid.eq(0)
1292         sync += r1.write_bram.eq(0)
1293         sync += r1.inc_acks.eq(0)
1294         sync += r1.dec_acks.eq(0)
1295
1296         sync += r1.ls_valid.eq(0)
1297         # complete tlbies and TLB loads in the third cycle
1298         sync += r1.mmu_done.eq(r0_valid & (r0.tlbie | r0.tlbld))
1299
1300         with m.If((req_op == Op.OP_LOAD_HIT) | (req_op == Op.OP_STCX_FAIL)):
1301             with m.If(~r0.mmu_req):
1302                 sync += r1.ls_valid.eq(1)
1303             with m.Else():
1304                 sync += r1.mmu_done.eq(1)
1305
1306         with m.If(r1.write_tag):
1307             # Store new tag in selected way
1308             replace_way_onehot = Signal(NUM_WAYS)
1309             comb += replace_way_onehot.eq(1<<replace_way)
1310             for i in range(NUM_WAYS):
1311                 with m.If(replace_way_onehot[i]):
1312                     ct = Signal(TAG_RAM_WIDTH)
1313                     comb += ct.eq(cache_tags[r1.store_index].tag)
1314                     comb += ct.word_select(i, TAG_WIDTH).eq(r1.reload_tag)
1315                     sync += cache_tags[r1.store_index].tag.eq(ct)
1316             sync += r1.store_way.eq(replace_way)
1317             sync += r1.write_tag.eq(0)
1318
1319         # Take request from r1.req if there is one there,
1320         # else from req_op, ra, etc.
1321         with m.If(r1.full):
1322             comb += req.eq(r1.req)
1323         with m.Else():
1324             comb += req.op.eq(req_op)
1325             comb += req.valid.eq(req_go)
1326             comb += req.mmu_req.eq(r0.mmu_req)
1327             comb += req.dcbz.eq(r0.req.dcbz)
1328             comb += req.real_addr.eq(ra)
1329
1330             with m.If(r0.req.dcbz):
1331                 # force data to 0 for dcbz
1332                 comb += req.data.eq(0)
1333             with m.Elif(r0.d_valid):
1334                 comb += req.data.eq(r0.req.data)
1335             with m.Else():
1336                 comb += req.data.eq(d_in.data)
1337
1338             # Select all bytes for dcbz
1339             # and for cacheable loads
1340             with m.If(r0.req.dcbz | (r0.req.load & ~r0.req.nc)):
1341                 comb += req.byte_sel.eq(~0) # all 1s
1342             with m.Else():
1343                 comb += req.byte_sel.eq(r0.req.byte_sel)
1344             comb += req.hit_way.eq(req_hit_way)
1345             comb += req.same_tag.eq(req_same_tag)
1346
1347             # Store the incoming request from r0,
1348             # if it is a slow request
1349             # Note that r1.full = 1 implies req_op = OP_NONE
1350             with m.If((req_op == Op.OP_LOAD_MISS)
1351                       | (req_op == Op.OP_LOAD_NC)
1352                       | (req_op == Op.OP_STORE_MISS)
1353                       | (req_op == Op.OP_STORE_HIT)):
1354                 sync += r1.req.eq(req)
1355                 sync += r1.full.eq(1)
1356
1357         # Main state machine
1358         with m.Switch(r1.state):
1359
1360             with m.Case(State.IDLE):
1361                 sync += r1.wb.adr.eq(req.real_addr[ROW_LINE_BITS:])
1362                 sync += r1.wb.sel.eq(req.byte_sel)
1363                 sync += r1.wb.dat.eq(req.data)
1364                 sync += r1.dcbz.eq(req.dcbz)
1365
1366                 # Keep track of our index and way
1367                 # for subsequent stores.
1368                 sync += r1.store_index.eq(req_idx)
1369                 sync += r1.store_row.eq(req_row)
1370                 sync += r1.end_row_ix.eq(get_row_of_line(req_row)-1)
1371                 sync += r1.reload_tag.eq(req_tag)
1372                 sync += r1.req.same_tag.eq(1)
1373
1374                 with m.If(req.op == Op.OP_STORE_HIT):
1375                     sync += r1.store_way.eq(req.hit_way)
1376
1377                 # Reset per-row valid bits,
1378                 # ready for handling OP_LOAD_MISS
1379                 for i in range(ROW_PER_LINE):
1380                     sync += r1.rows_valid[i].eq(0)
1381
1382                 with m.If(req_op != Op.OP_NONE):
1383                     sync += Display("cache op %d", req.op)
1384
1385                 with m.Switch(req.op):
1386                     with m.Case(Op.OP_LOAD_HIT):
1387                         # stay in IDLE state
1388                         pass
1389
1390                     with m.Case(Op.OP_LOAD_MISS):
1391                         sync += Display("cache miss real addr: %x " \
1392                                 "idx: %x tag: %x",
1393                                 req.real_addr, req_row, req_tag)
1394
1395                         # Start the wishbone cycle
1396                         sync += r1.wb.we.eq(0)
1397                         sync += r1.wb.cyc.eq(1)
1398                         sync += r1.wb.stb.eq(1)
1399
1400                         # Track that we had one request sent
1401                         sync += r1.state.eq(State.RELOAD_WAIT_ACK)
1402                         sync += r1.write_tag.eq(1)
1403
1404                     with m.Case(Op.OP_LOAD_NC):
1405                         sync += r1.wb.cyc.eq(1)
1406                         sync += r1.wb.stb.eq(1)
1407                         sync += r1.wb.we.eq(0)
1408                         sync += r1.state.eq(State.NC_LOAD_WAIT_ACK)
1409
1410                     with m.Case(Op.OP_STORE_HIT, Op.OP_STORE_MISS):
1411                         with m.If(~req.dcbz):
1412                             sync += r1.state.eq(State.STORE_WAIT_ACK)
1413                             sync += r1.acks_pending.eq(1)
1414                             sync += r1.full.eq(0)
1415                             sync += r1.slow_valid.eq(1)
1416
1417                             with m.If(~req.mmu_req):
1418                                 sync += r1.ls_valid.eq(1)
1419                             with m.Else():
1420                                 sync += r1.mmu_done.eq(1)
1421
1422                             with m.If(req.op == Op.OP_STORE_HIT):
1423                                 sync += r1.write_bram.eq(1)
1424                         with m.Else():
1425                             # dcbz is handled much like a load miss except
1426                             # that we are writing to memory instead of reading
1427                             sync += r1.state.eq(State.RELOAD_WAIT_ACK)
1428
1429                             with m.If(req.op == Op.OP_STORE_MISS):
1430                                 sync += r1.write_tag.eq(1)
1431
1432                         sync += r1.wb.we.eq(1)
1433                         sync += r1.wb.cyc.eq(1)
1434                         sync += r1.wb.stb.eq(1)
1435
1436                     # OP_NONE and OP_BAD do nothing
1437                     # OP_BAD & OP_STCX_FAIL were
1438                     # handled above already
1439                     with m.Case(Op.OP_NONE):
1440                         pass
1441                     with m.Case(Op.OP_BAD):
1442                         pass
1443                     with m.Case(Op.OP_STCX_FAIL):
1444                         pass
1445
1446             with m.Case(State.RELOAD_WAIT_ACK):
1447                 ld_stbs_done = Signal()
1448                 # Requests are all sent if stb is 0
1449                 comb += ld_stbs_done.eq(~r1.wb.stb)
1450
1451                 # If we are still sending requests, was one accepted?
1452                 with m.If((~bus.stall) & r1.wb.stb):
1453                     # That was the last word?  We are done sending.
1454                     # Clear stb and set ld_stbs_done so we can handle an
1455                     # eventual last ack on the same cycle.
1456                     # sigh - reconstruct wb adr with 3 extra 0s at front
1457                     wb_adr = Cat(Const(0, ROW_OFF_BITS), r1.wb.adr)
1458                     with m.If(is_last_row_addr(wb_adr, r1.end_row_ix)):
1459                         sync += r1.wb.stb.eq(0)
1460                         comb += ld_stbs_done.eq(1)
1461
1462                     # Calculate the next row address in the current cache line
1463                     row = Signal(LINE_OFF_BITS-ROW_OFF_BITS)
1464                     comb += row.eq(r1.wb.adr)
1465                     sync += r1.wb.adr[:LINE_OFF_BITS-ROW_OFF_BITS].eq(row+1)
1466
1467                 # Incoming acks processing
1468                 sync += r1.forward_valid1.eq(bus.ack)
1469                 with m.If(bus.ack):
1470                     srow = Signal(ROW_LINE_BITS)
1471                     comb += srow.eq(r1.store_row)
1472                     sync += r1.rows_valid[srow].eq(1)
1473
1474                     # If this is the data we were looking for,
1475                     # we can complete the request next cycle.
1476                     # Compare the whole address in case the
1477                     # request in r1.req is not the one that
1478                     # started this refill.
1479                     with m.If(req.valid & r1.req.same_tag &
1480                               ((r1.dcbz & r1.req.dcbz) |
1481                                (~r1.dcbz & (r1.req.op == Op.OP_LOAD_MISS))) &
1482                                 (r1.store_row == get_row(req.real_addr))):
1483                         sync += r1.full.eq(0)
1484                         sync += r1.slow_valid.eq(1)
1485                         with m.If(~r1.mmu_req):
1486                             sync += r1.ls_valid.eq(1)
1487                         with m.Else():
1488                             sync += r1.mmu_done.eq(1)
1489                         sync += r1.forward_sel.eq(~0) # all 1s
1490                         sync += r1.use_forward1.eq(1)
1491
1492                     # Check for completion
1493                     with m.If(ld_stbs_done & is_last_row(r1.store_row,
1494                                                       r1.end_row_ix)):
1495                         # Complete wishbone cycle
1496                         sync += r1.wb.cyc.eq(0)
1497
1498                         # Cache line is now valid
1499                         cv = Signal(INDEX_BITS)
1500                         comb += cv.eq(cache_tags[r1.store_index].valid)
1501                         comb += cv.bit_select(r1.store_way, 1).eq(1)
1502                         sync += cache_tags[r1.store_index].valid.eq(cv)
1503
1504                         sync += r1.state.eq(State.IDLE)
1505                         sync += Display("cache valid set %x "
1506                                         "idx %d way %d",
1507                                          cv, r1.store_index, r1.store_way)
1508
1509                     # Increment store row counter
1510                     sync += r1.store_row.eq(next_row(r1.store_row))
1511
1512             with m.Case(State.STORE_WAIT_ACK):
1513                 st_stbs_done = Signal()
1514                 acks        = Signal(3)
1515                 adjust_acks = Signal(3)
1516
1517                 comb += st_stbs_done.eq(~r1.wb.stb)
1518                 comb += acks.eq(r1.acks_pending)
1519
1520                 with m.If(r1.inc_acks != r1.dec_acks):
1521                     with m.If(r1.inc_acks):
1522                         comb += adjust_acks.eq(acks + 1)
1523                     with m.Else():
1524                         comb += adjust_acks.eq(acks - 1)
1525                 with m.Else():
1526                     comb += adjust_acks.eq(acks)
1527
1528                 sync += r1.acks_pending.eq(adjust_acks)
1529
1530                 # Clear stb when slave accepted request
1531                 with m.If(~bus.stall):
1532                     # See if there is another store waiting
1533                     # to be done which is in the same real page.
1534                     with m.If(req.valid):
1535                         _ra = req.real_addr[ROW_LINE_BITS:SET_SIZE_BITS]
1536                         sync += r1.wb.adr[0:SET_SIZE_BITS].eq(_ra)
1537                         sync += r1.wb.dat.eq(req.data)
1538                         sync += r1.wb.sel.eq(req.byte_sel)
1539
1540                     with m.If((adjust_acks < 7) & req.same_tag &
1541                                 ((req.op == Op.OP_STORE_MISS)
1542                                  | (req.op == Op.OP_STORE_HIT))):
1543                         sync += r1.wb.stb.eq(1)
1544                         comb += st_stbs_done.eq(0)
1545
1546                         with m.If(req.op == Op.OP_STORE_HIT):
1547                             sync += r1.write_bram.eq(1)
1548                         sync += r1.full.eq(0)
1549                         sync += r1.slow_valid.eq(1)
1550
1551                         # Store requests never come from the MMU
1552                         sync += r1.ls_valid.eq(1)
1553                         comb += st_stbs_done.eq(0)
1554                         sync += r1.inc_acks.eq(1)
1555                     with m.Else():
1556                         sync += r1.wb.stb.eq(0)
1557                         comb += st_stbs_done.eq(1)
1558
1559                 # Got ack ? See if complete.
1560                 with m.If(bus.ack):
1561                     with m.If(st_stbs_done & (adjust_acks == 1)):
1562                         sync += r1.state.eq(State.IDLE)
1563                         sync += r1.wb.cyc.eq(0)
1564                         sync += r1.wb.stb.eq(0)
1565                     sync += r1.dec_acks.eq(1)
1566
1567             with m.Case(State.NC_LOAD_WAIT_ACK):
1568                 # Clear stb when slave accepted request
1569                 with m.If(~bus.stall):
1570                     sync += r1.wb.stb.eq(0)
1571
1572                 # Got ack ? complete.
1573                 with m.If(bus.ack):
1574                     sync += r1.state.eq(State.IDLE)
1575                     sync += r1.full.eq(0)
1576                     sync += r1.slow_valid.eq(1)
1577
1578                     with m.If(~r1.mmu_req):
1579                         sync += r1.ls_valid.eq(1)
1580                     with m.Else():
1581                         sync += r1.mmu_done.eq(1)
1582
1583                     sync += r1.forward_sel.eq(~0) # all 1s
1584                     sync += r1.use_forward1.eq(1)
1585                     sync += r1.wb.cyc.eq(0)
1586                     sync += r1.wb.stb.eq(0)
1587
1588     def dcache_log(self, m, r1, valid_ra, tlb_hit, stall_out):
1589
1590         sync = m.d.sync
1591         d_out, bus, log_out = self.d_out, self.bus, self.log_out
1592
1593         sync += log_out.eq(Cat(r1.state[:3], valid_ra, tlb_hit.way[:3],
1594                                stall_out, req_op[:3], d_out.valid, d_out.error,
1595                                r1.wb.cyc, r1.wb.stb, bus.ack, bus.stall,
1596                                r1.real_adr[3:6]))
1597
1598     def elaborate(self, platform):
1599
1600         m = Module()
1601         comb = m.d.comb
1602         d_in = self.d_in
1603
1604         # Storage. Hopefully "cache_rows" is a BRAM, the rest is LUTs
1605         cache_tags       = CacheTagArray()
1606         cache_tag_set    = Signal(TAG_RAM_WIDTH)
1607
1608         # TODO attribute ram_style : string;
1609         # TODO attribute ram_style of cache_tags : signal is "distributed";
1610
1611         """note: these are passed to nmigen.hdl.Memory as "attributes".
1612            don't know how, just that they are.
1613         """
1614         # TODO attribute ram_style of
1615         #  dtlb_tags : signal is "distributed";
1616         # TODO attribute ram_style of
1617         #  dtlb_ptes : signal is "distributed";
1618
1619         r0      = RegStage0("r0")
1620         r0_full = Signal()
1621
1622         r1 = RegStage1("r1")
1623
1624         reservation = Reservation()
1625
1626         # Async signals on incoming request
1627         req_index    = Signal(INDEX_BITS)
1628         req_row      = Signal(ROW_BITS)
1629         req_hit_way  = Signal(WAY_BITS)
1630         req_tag      = Signal(TAG_BITS)
1631         req_op       = Signal(Op)
1632         req_data     = Signal(64)
1633         req_same_tag = Signal()
1634         req_go       = Signal()
1635
1636         early_req_row     = Signal(ROW_BITS)
1637
1638         cancel_store      = Signal()
1639         set_rsrv          = Signal()
1640         clear_rsrv        = Signal()
1641
1642         r0_valid          = Signal()
1643         r0_stall          = Signal()
1644
1645         use_forward1_next = Signal()
1646         use_forward2_next = Signal()
1647
1648         cache_out_row     = Signal(WB_DATA_BITS)
1649
1650         plru_victim       = Signal(WAY_BITS)
1651         replace_way       = Signal(WAY_BITS)
1652
1653         # Wishbone read/write/cache write formatting signals
1654         bus_sel           = Signal(8)
1655
1656         # TLB signals
1657         tlb_way       = TLBRecord("tlb_way")
1658         tlb_req_index = Signal(TLB_SET_BITS)
1659         tlb_hit       = TLBHit("tlb_hit")
1660         pte           = Signal(TLB_PTE_BITS)
1661         ra            = Signal(REAL_ADDR_BITS)
1662         valid_ra      = Signal()
1663         perm_attr     = PermAttr("dc_perms")
1664         rc_ok         = Signal()
1665         perm_ok       = Signal()
1666         access_ok     = Signal()
1667
1668         tlb_plru_victim = Signal(TLB_WAY_BITS)
1669
1670         # we don't yet handle collisions between loadstore1 requests
1671         # and MMU requests
1672         comb += self.m_out.stall.eq(0)
1673
1674         # Hold off the request in r0 when r1 has an uncompleted request
1675         comb += r0_stall.eq(r0_full & (r1.full | d_in.hold))
1676         comb += r0_valid.eq(r0_full & ~r1.full & ~d_in.hold)
1677         comb += self.stall_out.eq(r0_stall)
1678
1679         # deal with litex not doing wishbone pipeline mode
1680         # XXX in wrong way.  FIFOs are needed in the SRAM test
1681         # so that stb/ack match up. same thing done in icache.py
1682         comb += self.bus.stall.eq(self.bus.cyc & ~self.bus.ack)
1683
1684         # Wire up wishbone request latch out of stage 1
1685         comb += self.bus.we.eq(r1.wb.we)
1686         comb += self.bus.adr.eq(r1.wb.adr)
1687         comb += self.bus.sel.eq(r1.wb.sel)
1688         comb += self.bus.stb.eq(r1.wb.stb)
1689         comb += self.bus.dat_w.eq(r1.wb.dat)
1690         comb += self.bus.cyc.eq(r1.wb.cyc)
1691
1692         # create submodule TLBUpdate
1693         m.submodules.dtlb_update = self.dtlb_update = DTLBUpdate()
1694         dtlb = self.dtlb_update.dtlb
1695
1696         # call sub-functions putting everything together, using shared
1697         # signals established above
1698         self.stage_0(m, r0, r1, r0_full)
1699         self.tlb_read(m, r0_stall, tlb_way, dtlb)
1700         self.tlb_search(m, tlb_req_index, r0, r0_valid,
1701                         tlb_way,
1702                         pte, tlb_hit, valid_ra, perm_attr, ra)
1703         self.tlb_update(m, r0_valid, r0, dtlb, tlb_req_index,
1704                         tlb_hit, tlb_plru_victim,
1705                         tlb_way)
1706         self.maybe_plrus(m, r1, plru_victim)
1707         self.maybe_tlb_plrus(m, r1, tlb_plru_victim, tlb_req_index)
1708         self.cache_tag_read(m, r0_stall, req_index, cache_tag_set, cache_tags)
1709         self.dcache_request(m, r0, ra, req_index, req_row, req_tag,
1710                            r0_valid, r1, cache_tags, replace_way,
1711                            use_forward1_next, use_forward2_next,
1712                            req_hit_way, plru_victim, rc_ok, perm_attr,
1713                            valid_ra, perm_ok, access_ok, req_op, req_go,
1714                            tlb_hit, tlb_way, cache_tag_set,
1715                            cancel_store, req_same_tag, r0_stall, early_req_row)
1716         self.reservation_comb(m, cancel_store, set_rsrv, clear_rsrv,
1717                            r0_valid, r0, reservation)
1718         self.reservation_reg(m, r0_valid, access_ok, set_rsrv, clear_rsrv,
1719                            reservation, r0)
1720         self.writeback_control(m, r1, cache_out_row)
1721         self.rams(m, r1, early_req_row, cache_out_row, replace_way)
1722         self.dcache_fast_hit(m, req_op, r0_valid, r0, r1,
1723                         req_hit_way, req_index, req_tag, access_ok,
1724                         tlb_hit, tlb_req_index)
1725         self.dcache_slow(m, r1, use_forward1_next, use_forward2_next,
1726                     r0, replace_way,
1727                     req_hit_way, req_same_tag,
1728                          r0_valid, req_op, cache_tags, req_go, ra)
1729         #self.dcache_log(m, r1, valid_ra, tlb_hit, stall_out)
1730
1731         return m
1732
1733
1734 if __name__ == '__main__':
1735     dut = DCache()
1736     vl = rtlil.convert(dut, ports=[])
1737     with open("test_dcache.il", "w") as f:
1738         f.write(vl)