1 """DCache
2
3 based on Anton Blanchard microwatt dcache.vhdl
4
5 note that the microwatt dcache wishbone interface expects "stall".
6 for simplicity at the moment this is hard-coded to cyc & ~ack.
7 see WB4 spec, p84, section 5.2.1
8
9 IMPORTANT: for store, the data is sampled the cycle AFTER the "valid"
10 is raised. sigh
11
12 Links:
13
14 * https://libre-soc.org/3d_gpu/architecture/set_associative_cache.jpg
15 * https://bugs.libre-soc.org/show_bug.cgi?id=469
16
17 """
18
19 import sys
20
21 from nmutil.gtkw import write_gtkw
22
23 sys.setrecursionlimit(1000000)
24
25 from enum import Enum, unique
26
27 from nmigen import (Module, Signal, Elaboratable, Cat, Repl, Array, Const,
28 Record)
29 from nmutil.util import Display
30 from nmigen.lib.coding import Decoder
31
32 from copy import deepcopy
33 from random import randint, seed
34
35 from nmigen_soc.wishbone.bus import Interface
36
37 from nmigen.cli import main
38 from nmutil.iocontrol import RecordObject
39 from nmigen.utils import log2_int
40 from soc.experiment.mem_types import (LoadStore1ToDCacheType,
41 DCacheToLoadStore1Type,
42 MMUToDCacheType,
43 DCacheToMMUType)
44
45 from soc.experiment.wb_types import (WB_ADDR_BITS, WB_DATA_BITS, WB_SEL_BITS,
46 WBAddrType, WBDataType, WBSelType,
47 WBMasterOut, WBSlaveOut,
48 WBMasterOutVector, WBSlaveOutVector,
49 WBIOMasterOut, WBIOSlaveOut)
50
51 from soc.experiment.cache_ram import CacheRam
52 #from soc.experiment.plru import PLRU
53 from nmutil.plru import PLRU, PLRUs
54
55 # for test
56 from soc.bus.sram import SRAM
57 from nmigen import Memory
58 from nmigen.cli import rtlil
59
60 # NOTE: to use cxxsim, export NMIGEN_SIM_MODE=cxxsim from the shell
61 # Also, check out the cxxsim nmigen branch, and latest yosys from git
62 from nmutil.sim_tmp_alternative import Simulator
63
64 from nmutil.util import wrap
65
66
67 # TODO: make these parameters of DCache at some point
68 LINE_SIZE = 64 # Line size in bytes
69 NUM_LINES = 16 # Number of lines in a set
70 NUM_WAYS = 4 # Number of ways
71 TLB_SET_SIZE = 64 # L1 DTLB sets (entries per way)
72 TLB_NUM_WAYS = 2 # L1 DTLB number of ways
73 TLB_LG_PGSZ = 12 # L1 DTLB log_2(page_size)
74 LOG_LENGTH = 0 # Non-zero to enable log data collection
75
76 # BRAM organisation: We never access more than
77 # WB_DATA_BITS at a time, so to save
78 # resources we make the array only that wide, and
79 # use consecutive indices to make a cache "line"
80 #
81 # ROW_SIZE is the width in bytes of the BRAM
82 # (based on WB, so 64-bits)
83 ROW_SIZE = WB_DATA_BITS // 8
84
85 # ROW_PER_LINE is the number of rows (wishbone
86 # transactions) in a line
87 ROW_PER_LINE = LINE_SIZE // ROW_SIZE
88
89 # BRAM_ROWS is the number of rows in BRAM needed
90 # to represent the full dcache
91 BRAM_ROWS = NUM_LINES * ROW_PER_LINE
92
93 print ("ROW_SIZE", ROW_SIZE)
94 print ("ROW_PER_LINE", ROW_PER_LINE)
95 print ("BRAM_ROWS", BRAM_ROWS)
96 print ("NUM_WAYS", NUM_WAYS)
97
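# Worked example with the defaults above (documentation only;
# WB_DATA_BITS is asserted to be 64 further down):
#   ROW_SIZE     = 64 // 8  = 8 bytes per BRAM row
#   ROW_PER_LINE = 64 // 8  = 8 wishbone transfers per cache line
#   BRAM_ROWS    = 16 * 8   = 128 rows in each way's BRAM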
98 # Bit field counts in the address
99
100 # REAL_ADDR_BITS is the number of real address
101 # bits that we store
102 REAL_ADDR_BITS = 56
103
104 # ROW_BITS is the number of bits to select a row
105 ROW_BITS = log2_int(BRAM_ROWS)
106
107 # ROW_LINE_BITS is the number of bits to select
108 # a row within a line
109 ROW_LINE_BITS = log2_int(ROW_PER_LINE)
110
111 # LINE_OFF_BITS is the number of bits for
112 # the offset in a cache line
113 LINE_OFF_BITS = log2_int(LINE_SIZE)
114
115 # ROW_OFF_BITS is the number of bits for
116 # the offset in a row
117 ROW_OFF_BITS = log2_int(ROW_SIZE)
118
119 # INDEX_BITS is the number of bits to
120 # select a cache line
121 INDEX_BITS = log2_int(NUM_LINES)
122
123 # SET_SIZE_BITS is the log base 2 of the set size
124 SET_SIZE_BITS = LINE_OFF_BITS + INDEX_BITS
125
126 # TAG_BITS is the number of bits of
127 # the tag part of the address
128 TAG_BITS = REAL_ADDR_BITS - SET_SIZE_BITS
129
130 # TAG_WIDTH is the width in bits of each way of the tag RAM
131 TAG_WIDTH = TAG_BITS + 7 - ((TAG_BITS + 7) % 8)
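# e.g. with the defaults TAG_BITS is 46, rounded up here to a
# byte-multiple TAG_WIDTH of 48 for the tag RAM layout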
132
133 # WAY_BITS is the number of bits to select a way
134 WAY_BITS = log2_int(NUM_WAYS)
135
136 # Example of layout for 32 lines of 64 bytes:
137 layout = """\
138 .. tag |index| line |
139 .. | row | |
140 .. | |---| | ROW_LINE_BITS (3)
141 .. | |--- - --| LINE_OFF_BITS (6)
142 .. | |- --| ROW_OFF_BITS (3)
143 .. |----- ---| | ROW_BITS (8)
144 .. |-----| | INDEX_BITS (5)
145 .. --------| | TAG_BITS (45)
146 """
147 print (layout)
148 print ("Dcache TAG %d IDX %d ROW_BITS %d ROFF %d LOFF %d RLB %d" % \
149 (TAG_BITS, INDEX_BITS, ROW_BITS,
150 ROW_OFF_BITS, LINE_OFF_BITS, ROW_LINE_BITS))
151 print ("index @: %d-%d" % (LINE_OFF_BITS, SET_SIZE_BITS))
152 print ("row @: %d-%d" % (ROW_OFF_BITS, SET_SIZE_BITS))
153 print ("tag @: %d-%d width %d" % (SET_SIZE_BITS, REAL_ADDR_BITS, TAG_WIDTH))
154
155 TAG_RAM_WIDTH = TAG_WIDTH * NUM_WAYS
156
157 print ("TAG_RAM_WIDTH", TAG_RAM_WIDTH)
158
159 def CacheTagArray():
160 tag_layout = [('valid', 1),
161 ('tag', TAG_RAM_WIDTH),
162 ]
163 return Array(Record(tag_layout, name="tag%d" % x) for x in range(NUM_LINES))
164
165 def RowPerLineValidArray():
166 return Array(Signal(name="rows_valid%d" % x) \
167 for x in range(ROW_PER_LINE))
168
169 # L1 TLB
170 TLB_SET_BITS = log2_int(TLB_SET_SIZE)
171 TLB_WAY_BITS = log2_int(TLB_NUM_WAYS)
172 TLB_EA_TAG_BITS = 64 - (TLB_LG_PGSZ + TLB_SET_BITS)
173 TLB_TAG_WAY_BITS = TLB_NUM_WAYS * TLB_EA_TAG_BITS
174 TLB_PTE_BITS = 64
175 TLB_PTE_WAY_BITS = TLB_NUM_WAYS * TLB_PTE_BITS
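# worked numbers for the defaults above (documentation only):
#   TLB_SET_BITS = 6, TLB_WAY_BITS = 1, TLB_EA_TAG_BITS = 64-(12+6) = 46
#   so one TLB row packs 2*46 = 92 tag bits and 2*64 = 128 PTE bits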
176
177 def ispow2(x):
178 return (1<<log2_int(x, False)) == x
179
180 assert (LINE_SIZE % ROW_SIZE) == 0, "LINE_SIZE not multiple of ROW_SIZE"
181 assert ispow2(LINE_SIZE), "LINE_SIZE not power of 2"
182 assert ispow2(NUM_LINES), "NUM_LINES not power of 2"
183 assert ispow2(ROW_PER_LINE), "ROW_PER_LINE not power of 2"
184 assert ROW_BITS == (INDEX_BITS + ROW_LINE_BITS), "geometry bits don't add up"
185 assert (LINE_OFF_BITS == ROW_OFF_BITS + ROW_LINE_BITS), \
186 "geometry bits don't add up"
187 assert REAL_ADDR_BITS == (TAG_BITS + INDEX_BITS + LINE_OFF_BITS), \
188 "geometry bits don't add up"
189 assert REAL_ADDR_BITS == (TAG_BITS + ROW_BITS + ROW_OFF_BITS), \
190 "geometry bits don't add up"
191 assert 64 == WB_DATA_BITS, "Can't yet handle wb width that isn't 64-bits"
192 assert SET_SIZE_BITS <= TLB_LG_PGSZ, "Set indexed by virtual address"
193
194 def TLBHit(name):
195 return Record([('valid', 1),
196 ('way', TLB_WAY_BITS)], name=name)
197
198 def TLBTagEAArray():
199 return Array(Signal(TLB_EA_TAG_BITS, name="tlbtagea%d" % x) \
200 for x in range (TLB_NUM_WAYS))
201
202 def TLBRecord(name):
203 tlb_layout = [('valid', TLB_NUM_WAYS),
204 ('tag', TLB_TAG_WAY_BITS),
205 ('pte', TLB_PTE_WAY_BITS)
206 ]
207 return Record(tlb_layout, name=name)
208
209 def TLBArray():
210 return Array(TLBRecord(name="tlb%d" % x) for x in range(TLB_SET_SIZE))
211
212 def HitWaySet():
213 return Array(Signal(WAY_BITS, name="hitway_%d" % x) \
214 for x in range(TLB_NUM_WAYS))
215
216 # Cache RAM interface
217 def CacheRamOut():
218 return Array(Signal(WB_DATA_BITS, name="cache_out%d" % x) \
219 for x in range(NUM_WAYS))
220
221 # PLRU output interface
222 def PLRUOut():
223 return Array(Signal(WAY_BITS, name="plru_out%d" % x) \
224 for x in range(NUM_LINES))
225
226 # TLB PLRU output interface
227 def TLBPLRUOut():
228 return Array(Signal(TLB_WAY_BITS, name="tlbplru_out%d" % x) \
229 for x in range(TLB_SET_SIZE))
230
231 # Helper functions to decode incoming requests
232 #
233 # Return the cache line index (tag index) for an address
234 def get_index(addr):
235 return addr[LINE_OFF_BITS:SET_SIZE_BITS]
236
237 # Return the cache row index (data memory) for an address
238 def get_row(addr):
239 return addr[ROW_OFF_BITS:SET_SIZE_BITS]
240
241 # Return the index of a row within a line
242 def get_row_of_line(row):
243 return row[:ROW_BITS][:ROW_LINE_BITS]
244
245 # Returns whether this is the last row of a line
246 def is_last_row_addr(addr, last):
247 return addr[ROW_OFF_BITS:LINE_OFF_BITS] == last
248
249 # Returns whether this is the last row of a line
250 def is_last_row(row, last):
251 return get_row_of_line(row) == last
252
253 # Return the next row in the current cache line. We use a
254 # dedicated function in order to limit the size of the
255 # generated adder to be only the bits within a cache line
256 # (3 bits with default settings)
257 def next_row(row):
258 row_v = row[0:ROW_LINE_BITS] + 1
259 return Cat(row_v[:ROW_LINE_BITS], row[ROW_LINE_BITS:])
260
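# e.g. with ROW_LINE_BITS=3, next_row(0b0101_111) yields 0b0101_000:
# only the low 3 bits feed the adder, so the row number wraps within
# its own cache line and the upper (index) bits pass through unchanged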
261 # Get the tag value from the address
262 def get_tag(addr):
263 return addr[SET_SIZE_BITS:REAL_ADDR_BITS]
264
265 # Read a tag from a tag memory row
266 def read_tag(way, tagset):
267 return tagset.word_select(way, TAG_WIDTH)[:TAG_BITS]
268
269 # Read a TLB tag from a TLB tag memory row
270 def read_tlb_tag(way, tags):
271 return tags.word_select(way, TLB_EA_TAG_BITS)
272
273 # Write a TLB tag to a TLB tag memory row
274 def write_tlb_tag(way, tags, tag):
275 return read_tlb_tag(way, tags).eq(tag)
276
277 # Read a PTE from a TLB PTE memory row
278 def read_tlb_pte(way, ptes):
279 return ptes.word_select(way, TLB_PTE_BITS)
280
281 def write_tlb_pte(way, ptes, newpte):
282 return read_tlb_pte(way, ptes).eq(newpte)
283
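# A usage sketch for the helpers above (documentation only): a tag or
# PTE "row" is TLB_NUM_WAYS fields concatenated into one Signal, so
# for way 1 with the defaults:
#   read_tlb_pte(1, ptes)           selects bits [64:128] of ptes
#   write_tlb_pte(1, ptes, newpte)  assigns only that 64-bit slice,
# leaving way 0's PTE untouched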
284
285 # Record for storing permission, attribute, etc. bits from a PTE
286 class PermAttr(RecordObject):
287 def __init__(self, name=None):
288 super().__init__(name=name)
289 self.reference = Signal()
290 self.changed = Signal()
291 self.nocache = Signal()
292 self.priv = Signal()
293 self.rd_perm = Signal()
294 self.wr_perm = Signal()
295
296
297 def extract_perm_attr(pte):
298 pa = PermAttr() # unused stub: PTE bits are decoded directly in tlb_search
299 return pa
300
301
302 # Type of operation on a "valid" input
303 @unique
304 class Op(Enum):
305 OP_NONE = 0
306 OP_BAD = 1 # NC cache hit, TLB miss, prot/RC failure
307 OP_STCX_FAIL = 2 # conditional store w/o reservation
308 OP_LOAD_HIT = 3 # Cache hit on load
309 OP_LOAD_MISS = 4 # Load missing cache
310 OP_LOAD_NC = 5 # Non-cachable load
311 OP_STORE_HIT = 6 # Store hitting cache
312 OP_STORE_MISS = 7 # Store missing cache
313
314
315 # Cache state machine
316 @unique
317 class State(Enum):
318 IDLE = 0 # Normal load hit processing
319 RELOAD_WAIT_ACK = 1 # Cache reload wait ack
320 STORE_WAIT_ACK = 2 # Store wait ack
321 NC_LOAD_WAIT_ACK = 3 # Non-cachable load wait ack
322
323
324 # Dcache operations:
325 #
326 # In order to make timing, we use the BRAMs with
327 # an output buffer, which means that the BRAM
328 # output is delayed by an extra cycle.
329 #
330 # Thus, the dcache has a 2-stage internal pipeline
331 # for cache hits with no stalls.
332 #
333 # All other operations are handled via stalling
334 # in the first stage.
335 #
336 # The second stage can thus complete a hit at the same
337 # time as the first stage emits a stall for a complex op.
338 #
339 # Stage 0 register, basically contains just the latched request
340
341 class RegStage0(RecordObject):
342 def __init__(self, name=None):
343 super().__init__(name=name)
344 self.req = LoadStore1ToDCacheType(name="lsmem")
345 self.tlbie = Signal() # indicates a tlbie request (from MMU)
346 self.doall = Signal() # with tlbie, indicates flush whole TLB
347 self.tlbld = Signal() # indicates a TLB load request (from MMU)
348 self.mmu_req = Signal() # indicates source of request
349 self.d_valid = Signal() # indicates req.data is valid now
350
351
352 class MemAccessRequest(RecordObject):
353 def __init__(self, name=None):
354 super().__init__(name=name)
355 self.op = Signal(Op)
356 self.valid = Signal()
357 self.dcbz = Signal()
358 self.real_addr = Signal(REAL_ADDR_BITS)
359 self.data = Signal(64)
360 self.byte_sel = Signal(8)
361 self.hit_way = Signal(WAY_BITS)
362 self.same_tag = Signal()
363 self.mmu_req = Signal()
364
365
366 # First stage register, contains state for stage 1 of load hits
367 # and for the state machine used by all other operations
368 class RegStage1(RecordObject):
369 def __init__(self, name=None):
370 super().__init__(name=name)
371 # Info about the request
372 self.full = Signal() # have uncompleted request
373 self.mmu_req = Signal() # request is from MMU
374 self.req = MemAccessRequest(name="reqmem")
375
376 # Cache hit state
377 self.hit_way = Signal(WAY_BITS)
378 self.hit_load_valid = Signal()
379 self.hit_index = Signal(INDEX_BITS)
380 self.cache_hit = Signal()
381
382 # TLB hit state
383 self.tlb_hit = TLBHit("tlb_hit")
384 self.tlb_hit_index = Signal(TLB_SET_BITS)
385
386 # 2-stage data buffer for data forwarded from writes to reads
387 self.forward_data1 = Signal(64)
388 self.forward_data2 = Signal(64)
389 self.forward_sel1 = Signal(8)
390 self.forward_valid1 = Signal()
391 self.forward_way1 = Signal(WAY_BITS)
392 self.forward_row1 = Signal(ROW_BITS)
393 self.use_forward1 = Signal()
394 self.forward_sel = Signal(8)
395
396 # Cache miss state (reload state machine)
397 self.state = Signal(State)
398 self.dcbz = Signal()
399 self.write_bram = Signal()
400 self.write_tag = Signal()
401 self.slow_valid = Signal()
402 self.wb = WBMasterOut("wb")
403 self.reload_tag = Signal(TAG_BITS)
404 self.store_way = Signal(WAY_BITS)
405 self.store_row = Signal(ROW_BITS)
406 self.store_index = Signal(INDEX_BITS)
407 self.end_row_ix = Signal(ROW_LINE_BITS)
408 self.rows_valid = RowPerLineValidArray()
409 self.acks_pending = Signal(3)
410 self.inc_acks = Signal()
411 self.dec_acks = Signal()
412
413 # Signals to complete (possibly with error)
414 self.ls_valid = Signal()
415 self.ls_error = Signal()
416 self.mmu_done = Signal()
417 self.mmu_error = Signal()
418 self.cache_paradox = Signal()
419
420 # Signal to complete a failed stcx.
421 self.stcx_fail = Signal()
422
423
424 # Reservation information
425 class Reservation(RecordObject):
426 def __init__(self):
427 super().__init__()
428 self.valid = Signal()
429 self.addr = Signal(64-LINE_OFF_BITS)
430
431
432 class DTLBUpdate(Elaboratable):
433 def __init__(self):
434 self.dtlb = TLBArray()
435 self.tlbie = Signal()
436 self.tlbwe = Signal()
437 self.doall = Signal()
438 self.tlb_hit = TLBHit("tlb_hit")
439 self.tlb_req_index = Signal(TLB_SET_BITS)
440
441 self.tlb_tag_way = Signal(TLB_TAG_WAY_BITS)
442 self.tlb_pte_way = Signal(TLB_PTE_WAY_BITS)
443 self.repl_way = Signal(TLB_WAY_BITS)
444 self.eatag = Signal(TLB_EA_TAG_BITS)
445 self.pte_data = Signal(TLB_PTE_BITS)
446
447 self.dv = Signal(TLB_NUM_WAYS) # tlb_way_valids_t
448
449 self.tb_out = Signal(TLB_TAG_WAY_BITS) # tlb_way_tags_t
450 self.db_out = Signal(TLB_NUM_WAYS) # tlb_way_valids_t
451 self.pb_out = Signal(TLB_PTE_WAY_BITS) # tlb_way_ptes_t
452
453 # read from dtlb array
454 self.tlb_read = Signal()
455 self.tlb_read_index = Signal(TLB_SET_BITS)
456 self.tlb_way = TLBRecord("o_tlb_way")
457
458 def elaborate(self, platform):
459 m = Module()
460 comb = m.d.comb
461 sync = m.d.sync
462
463 tagset = Signal(TLB_TAG_WAY_BITS)
464 pteset = Signal(TLB_PTE_WAY_BITS)
465 updated = Signal()
466 v_updated = Signal()
467
468 dtlb, tlb_req_index = self.dtlb, self.tlb_req_index
469 tb_out, pb_out, db_out = self.tb_out, self.pb_out, self.db_out
470 comb += db_out.eq(self.dv)
471
472 with m.If(self.tlbie & self.doall):
473 # clear all valid bits at once
474 for i in range(TLB_SET_SIZE):
475 sync += dtlb[i].valid.eq(0)
476 with m.Elif(self.tlbie):
477 with m.If(self.tlb_hit.valid):
478 comb += db_out.bit_select(self.tlb_hit.way, 1).eq(0)
479 comb += v_updated.eq(1)
480
481 with m.Elif(self.tlbwe):
482
483 comb += tagset.eq(self.tlb_tag_way)
484 comb += write_tlb_tag(self.repl_way, tagset, self.eatag)
485 comb += tb_out.eq(tagset)
486
487 comb += pteset.eq(self.tlb_pte_way)
488 comb += write_tlb_pte(self.repl_way, pteset, self.pte_data)
489 comb += pb_out.eq(pteset)
490
491 comb += db_out.bit_select(self.repl_way, 1).eq(1)
492
493 comb += updated.eq(1)
494 comb += v_updated.eq(1)
495
496 with m.If(updated):
497 sync += dtlb[tlb_req_index].tag.eq(self.tb_out)
498 sync += dtlb[tlb_req_index].pte.eq(self.pb_out)
499 with m.If(v_updated):
500 sync += dtlb[tlb_req_index].valid.eq(self.db_out)
501
502 comb += self.dv.eq(dtlb[tlb_req_index].valid)
503
504 # select one TLB way
505 with m.If(self.tlb_read):
506 sync += self.tlb_way.eq(dtlb[self.tlb_read_index])
507
508 return m
509
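# A minimal usage sketch for DTLBUpdate's ports (documentation only,
# assuming the signal meanings above): to write a TLB entry, drive
# for one cycle
#   tlbwe=1, tlb_req_index=<set>, repl_way=<way>, eatag=<ea tag>,
#   pte_data=<pte>, tlb_tag_way/tlb_pte_way=<current row contents>
# and dtlb[<set>] is updated on the next clock edge. reads are
# requested with tlb_read=1 and tlb_read_index, and the selected
# row appears one cycle later on tlb_way.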
510
511 class DCachePendingHit(Elaboratable):
512
513 def __init__(self, tlb_way,
514 cache_i_validdx, cache_tag_set,
515 req_addr,
516 hit_set):
517
518 self.go = Signal()
519 self.virt_mode = Signal()
520 self.is_hit = Signal()
521 self.tlb_hit = TLBHit("tlb_hit")
522 self.hit_way = Signal(WAY_BITS)
523 self.rel_match = Signal()
524 self.req_index = Signal(INDEX_BITS)
525 self.reload_tag = Signal(TAG_BITS)
526
527 self.tlb_way = tlb_way
528 self.cache_i_validdx = cache_i_validdx
529 self.cache_tag_set = cache_tag_set
530 self.req_addr = req_addr
531 self.hit_set = hit_set
532
533 def elaborate(self, platform):
534 m = Module()
535 comb = m.d.comb
536 sync = m.d.sync
537
538 go = self.go
539 virt_mode = self.virt_mode
540 is_hit = self.is_hit
541 tlb_way = self.tlb_way
542 cache_i_validdx = self.cache_i_validdx
543 cache_tag_set = self.cache_tag_set
544 req_addr = self.req_addr
545 tlb_hit = self.tlb_hit
546 hit_set = self.hit_set
547 hit_way = self.hit_way
548 rel_match = self.rel_match
549 req_index = self.req_index
550 reload_tag = self.reload_tag
551
552 rel_matches = Array(Signal(name="rel_matches_%d" % i) \
553 for i in range(TLB_NUM_WAYS))
554 hit_way_set = HitWaySet()
555
556 # Test if pending request is a hit on any way
557 # In order to make timing in virtual mode,
558 # when we are using the TLB, we compare each
559 # way with each of the real addresses from each way of
560 # the TLB, and then decide later which match to use.
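# (with the defaults that is NUM_WAYS * TLB_NUM_WAYS = 4*2 = 8 tag
# comparators running in parallel; tlb_hit.way then picks which
# column of results to believe)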
561
562 with m.If(virt_mode):
563 for j in range(TLB_NUM_WAYS): # tlb_num_way_t
564 s_tag = Signal(TAG_BITS, name="s_tag%d" % j)
565 s_hit = Signal()
566 s_pte = Signal(TLB_PTE_BITS)
567 s_ra = Signal(REAL_ADDR_BITS)
568 comb += s_pte.eq(read_tlb_pte(j, tlb_way.pte))
569 comb += s_ra.eq(Cat(req_addr[0:TLB_LG_PGSZ],
570 s_pte[TLB_LG_PGSZ:REAL_ADDR_BITS]))
571 comb += s_tag.eq(get_tag(s_ra))
572
573 for i in range(NUM_WAYS): # way_t
574 is_tag_hit = Signal(name="is_tag_hit_%d_%d" % (j, i))
575 comb += is_tag_hit.eq(go & cache_i_validdx[i] &
576 (read_tag(i, cache_tag_set) == s_tag)
577 & (tlb_way.valid[j]))
578 with m.If(is_tag_hit):
579 comb += hit_way_set[j].eq(i)
580 comb += s_hit.eq(1)
581 comb += hit_set[j].eq(s_hit)
582 with m.If(s_tag == reload_tag):
583 comb += rel_matches[j].eq(1)
584 with m.If(tlb_hit.valid):
585 comb += is_hit.eq(hit_set[tlb_hit.way])
586 comb += hit_way.eq(hit_way_set[tlb_hit.way])
587 comb += rel_match.eq(rel_matches[tlb_hit.way])
588 with m.Else():
589 s_tag = Signal(TAG_BITS)
590 comb += s_tag.eq(get_tag(req_addr))
591 for i in range(NUM_WAYS): # way_t
592 is_tag_hit = Signal(name="is_tag_hit_%d" % i)
593 comb += is_tag_hit.eq(go & cache_i_validdx[i] &
594 (read_tag(i, cache_tag_set) == s_tag))
595 with m.If(is_tag_hit):
596 comb += hit_way.eq(i)
597 comb += is_hit.eq(1)
598 with m.If(s_tag == reload_tag):
599 comb += rel_match.eq(1)
600
601 return m
602
603
604 class DCache(Elaboratable):
605 """Set associative dcache write-through
606
607 TODO (in no specific order):
608 * See list in icache.vhdl
609 * Complete load misses on the cycle when WB data comes instead of
610 at the end of line (this requires dealing with requests coming in
611 while not idle...)
612 """
613 def __init__(self):
614 self.d_in = LoadStore1ToDCacheType("d_in")
615 self.d_out = DCacheToLoadStore1Type("d_out")
616
617 self.m_in = MMUToDCacheType("m_in")
618 self.m_out = DCacheToMMUType("m_out")
619
620 self.stall_out = Signal()
621
622 # standard naming (wired to non-standard for compatibility)
623 self.bus = Interface(addr_width=32,
624 data_width=64,
625 granularity=8,
626 features={'stall'},
627 alignment=0,
628 name="dcache")
629
630 self.log_out = Signal(20)
631
632 def stage_0(self, m, r0, r1, r0_full):
633 """Latch the request in r0.req as long as we're not stalling
634 """
635 comb = m.d.comb
636 sync = m.d.sync
637 d_in, d_out, m_in = self.d_in, self.d_out, self.m_in
638
639 r = RegStage0("stage0")
640
641 # TODO, this goes in unit tests and formal proofs
642 with m.If(d_in.valid & m_in.valid):
643 sync += Display("request collision loadstore vs MMU")
644
645 with m.If(m_in.valid):
646 comb += r.req.valid.eq(1)
647 comb += r.req.load.eq(~(m_in.tlbie | m_in.tlbld)) # no invalidate
648 comb += r.req.dcbz.eq(0)
649 comb += r.req.nc.eq(0)
650 comb += r.req.reserve.eq(0)
651 comb += r.req.virt_mode.eq(0)
652 comb += r.req.priv_mode.eq(1)
653 comb += r.req.addr.eq(m_in.addr)
654 comb += r.req.data.eq(m_in.pte)
655 comb += r.req.byte_sel.eq(~0) # Const -1 sets all to 0b111....
656 comb += r.tlbie.eq(m_in.tlbie)
657 comb += r.doall.eq(m_in.doall)
658 comb += r.tlbld.eq(m_in.tlbld)
659 comb += r.mmu_req.eq(1)
660 m.d.sync += Display(" DCACHE req mmu addr %x pte %x ld %d",
661 m_in.addr, m_in.pte, r.req.load)
662
663 with m.Else():
664 comb += r.req.eq(d_in)
665 comb += r.req.data.eq(0)
666 comb += r.tlbie.eq(0)
667 comb += r.doall.eq(0)
668 comb += r.tlbld.eq(0)
669 comb += r.mmu_req.eq(0)
670 with m.If((~r1.full & ~d_in.hold) | ~r0_full):
671 sync += r0.eq(r)
672 sync += r0_full.eq(r.req.valid)
673 # Sample data the cycle after a request comes in from loadstore1.
674 # If another request has come in already then the data will get
675 # put directly into req.data below.
676 with m.If(r0.req.valid & ~r.req.valid & ~r0.d_valid &
677 ~r0.mmu_req):
678 sync += r0.req.data.eq(d_in.data)
679 sync += r0.d_valid.eq(1)
680 with m.If(d_in.valid):
681 m.d.sync += Display(" DCACHE req cache "
682 "virt %d addr %x data %x ld %d",
683 r.req.virt_mode, r.req.addr,
684 r.req.data, r.req.load)
685
686 def tlb_read(self, m, r0_stall, tlb_way, dtlb):
687 """TLB
688 Operates in the second cycle on the request latched in r0.req.
689 TLB updates write the entry at the end of the second cycle.
690 """
691 comb = m.d.comb
692 sync = m.d.sync
693 m_in, d_in = self.m_in, self.d_in
694
695 addrbits = Signal(TLB_SET_BITS)
696
697 amin = TLB_LG_PGSZ
698 amax = TLB_LG_PGSZ + TLB_SET_BITS
699
700 with m.If(m_in.valid):
701 comb += addrbits.eq(m_in.addr[amin : amax])
702 with m.Else():
703 comb += addrbits.eq(d_in.addr[amin : amax])
704
705 # If we have any op and the previous op isn't finished,
706 # then keep the same output for next cycle.
707 d = self.dtlb_update
708 comb += d.tlb_read_index.eq(addrbits)
709 comb += d.tlb_read.eq(~r0_stall)
710 comb += tlb_way.eq(d.tlb_way)
711
712 def maybe_tlb_plrus(self, m, r1, tlb_plru_victim, tlb_req_index):
713 """Generate TLB PLRUs
714 """
715 comb = m.d.comb
716 sync = m.d.sync
717
718 if TLB_NUM_WAYS == 0:
719 return
720
721 # Binary-to-Unary one-hot, enabled by tlb_hit valid
722 tlb_plrus = PLRUs(TLB_SET_SIZE, TLB_WAY_BITS)
723 m.submodules.tlb_plrus = tlb_plrus
724 comb += tlb_plrus.way.eq(r1.tlb_hit.way)
725 comb += tlb_plrus.valid.eq(r1.tlb_hit.valid)
726 comb += tlb_plrus.index.eq(r1.tlb_hit_index)
727 comb += tlb_plrus.isel.eq(tlb_req_index) # select victim
728 comb += tlb_plru_victim.eq(tlb_plrus.o_index) # selected victim
729
730 def tlb_search(self, m, tlb_req_index, r0, r0_valid,
731 tlb_way,
732 pte, tlb_hit, valid_ra, perm_attr, ra):
733
734 comb = m.d.comb
735
736 hitway = Signal(TLB_WAY_BITS)
737 hit = Signal()
738 eatag = Signal(TLB_EA_TAG_BITS)
739
740 TLB_LG_END = TLB_LG_PGSZ + TLB_SET_BITS
741 comb += tlb_req_index.eq(r0.req.addr[TLB_LG_PGSZ : TLB_LG_END])
742 comb += eatag.eq(r0.req.addr[TLB_LG_END : 64 ])
743
744 for i in range(TLB_NUM_WAYS):
745 is_tag_hit = Signal(name="is_tag_hit%d" % i)
746 tlb_tag = Signal(TLB_EA_TAG_BITS, name="tlb_tag%d" % i)
747 comb += tlb_tag.eq(read_tlb_tag(i, tlb_way.tag))
748 comb += is_tag_hit.eq((tlb_way.valid[i]) & (tlb_tag == eatag))
749 with m.If(is_tag_hit):
750 comb += hitway.eq(i)
751 comb += hit.eq(1)
752
753 comb += tlb_hit.valid.eq(hit & r0_valid)
754 comb += tlb_hit.way.eq(hitway)
755
756 with m.If(tlb_hit.valid):
757 comb += pte.eq(read_tlb_pte(hitway, tlb_way.pte))
758 comb += valid_ra.eq(tlb_hit.valid | ~r0.req.virt_mode)
759
760 with m.If(r0.req.virt_mode):
761 comb += ra.eq(Cat(Const(0, ROW_OFF_BITS),
762 r0.req.addr[ROW_OFF_BITS:TLB_LG_PGSZ],
763 pte[TLB_LG_PGSZ:REAL_ADDR_BITS]))
764 comb += perm_attr.reference.eq(pte[8])
765 comb += perm_attr.changed.eq(pte[7])
766 comb += perm_attr.nocache.eq(pte[5])
767 comb += perm_attr.priv.eq(pte[3])
768 comb += perm_attr.rd_perm.eq(pte[2])
769 comb += perm_attr.wr_perm.eq(pte[1])
770 with m.Else():
771 comb += ra.eq(Cat(Const(0, ROW_OFF_BITS),
772 r0.req.addr[ROW_OFF_BITS:REAL_ADDR_BITS]))
773 comb += perm_attr.reference.eq(1)
774 comb += perm_attr.changed.eq(1)
775 comb += perm_attr.nocache.eq(0)
776 comb += perm_attr.priv.eq(1)
777 comb += perm_attr.rd_perm.eq(1)
778 comb += perm_attr.wr_perm.eq(1)
779
780 with m.If(valid_ra):
781 m.d.sync += Display("DCACHE virt mode %d hit %d ra %x pte %x",
782 r0.req.virt_mode, tlb_hit.valid, ra, pte)
783 m.d.sync += Display(" perm ref=%d", perm_attr.reference)
784 m.d.sync += Display(" perm chg=%d", perm_attr.changed)
785 m.d.sync += Display(" perm noc=%d", perm_attr.nocache)
786 m.d.sync += Display(" perm prv=%d", perm_attr.priv)
787 m.d.sync += Display(" perm rdp=%d", perm_attr.rd_perm)
788 m.d.sync += Display(" perm wrp=%d", perm_attr.wr_perm)
789
790 def tlb_update(self, m, r0_valid, r0, dtlb, tlb_req_index,
791 tlb_hit, tlb_plru_victim, tlb_way):
792
793 comb = m.d.comb
794 sync = m.d.sync
795
796 tlbie = Signal()
797 tlbwe = Signal()
798
799 comb += tlbie.eq(r0_valid & r0.tlbie)
800 comb += tlbwe.eq(r0_valid & r0.tlbld)
801
802 d = self.dtlb_update
803
804 comb += d.tlbie.eq(tlbie)
805 comb += d.tlbwe.eq(tlbwe)
806 comb += d.doall.eq(r0.doall)
807 comb += d.tlb_hit.eq(tlb_hit)
808 comb += d.tlb_tag_way.eq(tlb_way.tag)
809 comb += d.tlb_pte_way.eq(tlb_way.pte)
810 comb += d.tlb_req_index.eq(tlb_req_index)
811
812 with m.If(tlb_hit.valid):
813 comb += d.repl_way.eq(tlb_hit.way)
814 with m.Else():
815 comb += d.repl_way.eq(tlb_plru_victim)
816 comb += d.eatag.eq(r0.req.addr[TLB_LG_PGSZ + TLB_SET_BITS:64])
817 comb += d.pte_data.eq(r0.req.data)
818
819 def maybe_plrus(self, m, r1, plru_victim):
820 """Generate PLRUs
821 """
822 comb = m.d.comb
823 sync = m.d.sync
824
825 if TLB_NUM_WAYS == 0:
826 return
827
828 m.submodules.plrus = plrus = PLRUs(NUM_LINES, WAY_BITS)
829 comb += plrus.way.eq(r1.hit_way)
830 comb += plrus.valid.eq(r1.cache_hit)
831 comb += plrus.index.eq(r1.hit_index)
832 comb += plrus.isel.eq(r1.store_index) # select victim
833 comb += plru_victim.eq(plrus.o_index) # selected victim
834
835 def cache_tag_read(self, m, r0_stall, req_index, cache_tag_set, cache_tags):
836 """Cache tag RAM read port
837 """
838 comb = m.d.comb
839 sync = m.d.sync
840 m_in, d_in = self.m_in, self.d_in
841
842 index = Signal(INDEX_BITS)
843
844 with m.If(r0_stall):
845 comb += index.eq(req_index)
846 with m.Elif(m_in.valid):
847 comb += index.eq(get_index(m_in.addr))
848 with m.Else():
849 comb += index.eq(get_index(d_in.addr))
850 sync += cache_tag_set.eq(cache_tags[index].tag)
851
852 def dcache_request(self, m, r0, ra, req_index, req_row, req_tag,
853 r0_valid, r1, cache_tags, replace_way,
854 use_forward1_next, use_forward2_next,
855 req_hit_way, plru_victim, rc_ok, perm_attr,
856 valid_ra, perm_ok, access_ok, req_op, req_go,
857 tlb_hit, tlb_way, cache_tag_set,
858 cancel_store, req_same_tag, r0_stall, early_req_row):
859 """Cache request parsing and hit detection
860 """
861
862 comb = m.d.comb
863 m_in, d_in = self.m_in, self.d_in
864
865 is_hit = Signal()
866 hit_way = Signal(WAY_BITS)
867 op = Signal(Op)
868 opsel = Signal(3)
869 go = Signal()
870 nc = Signal()
871 hit_set = Array(Signal(name="hit_set_%d" % i) \
872 for i in range(TLB_NUM_WAYS))
873 cache_i_validdx = Signal(NUM_WAYS)
874
875 # Extract line, row and tag from request
876 comb += req_index.eq(get_index(r0.req.addr))
877 comb += req_row.eq(get_row(r0.req.addr))
878 comb += req_tag.eq(get_tag(ra))
879
880 if False: # display on comb is a bit... busy.
881 comb += Display("dcache_req addr:%x ra: %x idx: %x tag: %x row: %x",
882 r0.req.addr, ra, req_index, req_tag, req_row)
883
884 comb += go.eq(r0_valid & ~(r0.tlbie | r0.tlbld) & ~r1.ls_error)
885 comb += cache_i_validdx.eq(cache_tags[req_index].valid)
886
887 m.submodules.dcache_pend = dc = DCachePendingHit(tlb_way,
888 cache_i_validdx, cache_tag_set,
889 r0.req.addr,
890 hit_set)
891 comb += dc.tlb_hit.eq(tlb_hit)
892 comb += dc.reload_tag.eq(r1.reload_tag)
893 comb += dc.virt_mode.eq(r0.req.virt_mode)
894 comb += dc.go.eq(go)
895 comb += dc.req_index.eq(req_index)
896
897 comb += is_hit.eq(dc.is_hit)
898 comb += hit_way.eq(dc.hit_way)
899 comb += req_same_tag.eq(dc.rel_match)
900
901 # See if the request matches the line currently being reloaded
902 with m.If((r1.state == State.RELOAD_WAIT_ACK) &
903 (req_index == r1.store_index) & req_same_tag):
904 # For a store, consider this a hit even if the row isn't
905 # valid since it will be by the time we perform the store.
906 # For a load, check the appropriate row valid bit.
907 rrow = Signal(ROW_LINE_BITS)
908 comb += rrow.eq(req_row)
909 valid = r1.rows_valid[rrow]
910 comb += is_hit.eq((~r0.req.load) | valid)
911 comb += hit_way.eq(replace_way)
912
913 # Whether to use forwarded data for a load or not
914 with m.If((get_row(r1.req.real_addr) == req_row) &
915 (r1.req.hit_way == hit_way)):
916 # Only need to consider r1.write_bram here, since if we
917 # are writing refill data here, then we don't have a
918 # cache hit this cycle on the line being refilled.
919 # (There is the possibility that the load following the
920 # load miss that started the refill could be to the old
921 # contents of the victim line, since it is a couple of
922 # cycles after the refill starts before we see the updated
923 # cache tag. In that case we don't use the bypass.)
924 comb += use_forward1_next.eq(r1.write_bram)
925 with m.If((r1.forward_row1 == req_row) & (r1.forward_way1 == hit_way)):
926 comb += use_forward2_next.eq(r1.forward_valid1)
927
928 # The way that matched on a hit
929 comb += req_hit_way.eq(hit_way)
930
931 # The way to replace on a miss
932 with m.If(r1.write_tag):
933 comb += replace_way.eq(plru_victim)
934 with m.Else():
935 comb += replace_way.eq(r1.store_way)
936
937 # work out whether we have permission for this access
938 # NB we don't yet implement AMR, thus no KUAP
939 comb += rc_ok.eq(perm_attr.reference
940 & (r0.req.load | perm_attr.changed))
941 comb += perm_ok.eq((r0.req.priv_mode | (~perm_attr.priv)) &
942 (perm_attr.wr_perm |
943 (r0.req.load & perm_attr.rd_perm)))
944 comb += access_ok.eq(valid_ra & perm_ok & rc_ok)
945
946 # Combine the request and cache hit status to decide what
947 # operation needs to be done
948 comb += nc.eq(r0.req.nc | perm_attr.nocache)
949 comb += op.eq(Op.OP_NONE)
950 with m.If(go):
951 with m.If(~access_ok):
952 m.d.sync += Display("DCACHE access fail valid_ra=%d p=%d rc=%d",
953 valid_ra, perm_ok, rc_ok)
954 comb += op.eq(Op.OP_BAD)
955 with m.Elif(cancel_store):
956 m.d.sync += Display("DCACHE cancel store")
957 comb += op.eq(Op.OP_STCX_FAIL)
958 with m.Else():
959 m.d.sync += Display("DCACHE valid_ra=%d nc=%d ld=%d",
960 valid_ra, nc, r0.req.load)
961 comb += opsel.eq(Cat(is_hit, nc, r0.req.load))
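# (Cat packs LSB-first, so read left-to-right the case patterns
# below are {load, nc, is_hit})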
962 with m.Switch(opsel):
963 with m.Case(0b101): comb += op.eq(Op.OP_LOAD_HIT)
964 with m.Case(0b100): comb += op.eq(Op.OP_LOAD_MISS)
965 with m.Case(0b110): comb += op.eq(Op.OP_LOAD_NC)
966 with m.Case(0b001): comb += op.eq(Op.OP_STORE_HIT)
967 with m.Case(0b000): comb += op.eq(Op.OP_STORE_MISS)
968 with m.Case(0b010): comb += op.eq(Op.OP_STORE_MISS)
969 with m.Case(0b011): comb += op.eq(Op.OP_BAD)
970 with m.Case(0b111): comb += op.eq(Op.OP_BAD)
971 comb += req_op.eq(op)
972 comb += req_go.eq(go)
973
974 # Version of the row number that is valid one cycle earlier
975 # in the cases where we need to read the cache data BRAM.
976 # If we're stalling then we need to keep reading the last
977 # row requested.
978 with m.If(~r0_stall):
979 with m.If(m_in.valid):
980 comb += early_req_row.eq(get_row(m_in.addr))
981 with m.Else():
982 comb += early_req_row.eq(get_row(d_in.addr))
983 with m.Else():
984 comb += early_req_row.eq(req_row)
985
986 def reservation_comb(self, m, cancel_store, set_rsrv, clear_rsrv,
987 r0_valid, r0, reservation):
988 """Handle load-with-reservation and store-conditional instructions
989 """
990 comb = m.d.comb
991
992 with m.If(r0_valid & r0.req.reserve):
993 # XXX generate alignment interrupt if address
994 # is not aligned XXX or if r0.req.nc = '1'
995 with m.If(r0.req.load):
996 comb += set_rsrv.eq(r0.req.atomic_last) # load with reservation
997 with m.Else():
998 comb += clear_rsrv.eq(r0.req.atomic_last) # store conditional
999 with m.If((~reservation.valid) |
1000 (r0.req.addr[LINE_OFF_BITS:64] != reservation.addr)):
1001 comb += cancel_store.eq(1)
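# worked example (documentation only, LINE_OFF_BITS=6): a larx to
# 0x1000 records reservation.addr = 0x1000 >> 6 = 0x40; a later
# stcx. to 0x1040 compares 0x41 != 0x40 above, raising cancel_store
# so the store completes as OP_STCX_FAIL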
1002
1003 def reservation_reg(self, m, r0_valid, access_ok, set_rsrv, clear_rsrv,
1004 reservation, r0):
1005 comb = m.d.comb
1006 sync = m.d.sync
1007
1008 with m.If(r0_valid & access_ok):
1009 with m.If(clear_rsrv):
1010 sync += reservation.valid.eq(0)
1011 with m.Elif(set_rsrv):
1012 sync += reservation.valid.eq(1)
1013 sync += reservation.addr.eq(r0.req.addr[LINE_OFF_BITS:64])
1014
1015 def writeback_control(self, m, r1, cache_out_row):
1016 """Return data for loads & completion control logic
1017 """
1018 comb = m.d.comb
1019 sync = m.d.sync
1020 d_out, m_out = self.d_out, self.m_out
1021
1022 data_out = Signal(64)
1023 data_fwd = Signal(64)
1024
1025 # Use the bypass if are reading the row that was
1026 # written 1 or 2 cycles ago, including for the
1027 # slow_valid = 1 case (i.e. completing a load
1028 # miss or a non-cacheable load).
1029 with m.If(r1.use_forward1):
1030 comb += data_fwd.eq(r1.forward_data1)
1031 with m.Else():
1032 comb += data_fwd.eq(r1.forward_data2)
1033
1034 comb += data_out.eq(cache_out_row)
1035
1036 for i in range(8):
1037 with m.If(r1.forward_sel[i]):
1038 dsel = data_fwd.word_select(i, 8)
1039 comb += data_out.word_select(i, 8).eq(dsel)
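# e.g. (sketch) a load hitting the row that a 2-byte store wrote
# last cycle with byte_sel=0b00001100 takes bytes 2-3 from the
# forwarded store data and the other six bytes from the BRAM row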
1040
1041 # DCache output to LoadStore
1042 comb += d_out.valid.eq(r1.ls_valid)
1043 comb += d_out.data.eq(data_out)
1044 comb += d_out.store_done.eq(~r1.stcx_fail)
1045 comb += d_out.error.eq(r1.ls_error)
1046 comb += d_out.cache_paradox.eq(r1.cache_paradox)
1047
1048 # Outputs to MMU
1049 comb += m_out.done.eq(r1.mmu_done)
1050 comb += m_out.err.eq(r1.mmu_error)
1051 comb += m_out.data.eq(data_out)
1052
1053 # We have a valid load or store hit or we just completed
1054 # a slow op such as a load miss, a NC load or a store
1055 #
1056 # Note: the load hit is delayed by one cycle. However it
1057 # still cannot collide with r.slow_valid (well, unless I
1058 # miscalculated) because slow_valid can only be set on a
1059 # subsequent request and not on its first cycle (the state
1060 # machine must have advanced), which makes slow_valid
1061 # at least 2 cycles from the previous hit_load_valid.
1062
1063 # Sanity: Only one of these must be set in any given cycle
1064
1065 if False: # TODO: need Display to get this to work
1066 assert (r1.slow_valid & r1.stcx_fail) != 1, \
1067 "unexpected slow_valid collision with stcx_fail"
1068
1069 assert ((r1.slow_valid | r1.stcx_fail) | r1.hit_load_valid) != 1, \
1070 "unexpected hit_load_delayed collision with slow_valid"
1071
1072 with m.If(~r1.mmu_req):
1073 # Request came from loadstore1...
1074 # Load hit case is the standard path
1075 with m.If(r1.hit_load_valid):
1076 sync += Display("completing load hit data=%x", data_out)
1077
1078 # error cases complete without stalling
1079 with m.If(r1.ls_error):
1080 with m.If(r1.dcbz):
1081 sync += Display("completing dcbz with error")
1082 with m.Else():
1083 sync += Display("completing ld/st with error")
1084
1085 # Slow ops (load miss, NC, stores)
1086 with m.If(r1.slow_valid):
1087 sync += Display("completing store or load miss adr=%x data=%x",
1088 r1.req.real_addr, data_out)
1089
1090 with m.Else():
1091 # Request came from MMU
1092 with m.If(r1.hit_load_valid):
1093 sync += Display("completing load hit to MMU, data=%x",
1094 m_out.data)
1095 # error cases complete without stalling
1096 with m.If(r1.mmu_error):
1097 sync += Display("completing MMU ld with error")
1098
1099 # Slow ops (i.e. load miss)
1100 with m.If(r1.slow_valid):
1101 sync += Display("completing MMU load miss, adr=%x data=%x",
1102 r1.req.real_addr, m_out.data)
1103
1104 def rams(self, m, r1, early_req_row, cache_out_row, replace_way):
1105 """rams
1106 Generate a cache RAM for each way. This handles the normal
1107 reads, writes from reloads and the special store-hit update
1108 path as well.
1109
1110 Note: the BRAMs have an extra read buffer, meaning the output
1111 is pipelined an extra cycle. This differs from the
1112 icache. The writeback logic needs to take that into
1113 account by using 1-cycle delayed signals for load hits.
1114 """
1115 comb = m.d.comb
1116 bus = self.bus
1117
1118 # binary-to-unary one-hot decoders here. the replace-way one-hot is gated
1119 # (enabled) by bus.ack, not-write-bram, and state RELOAD_WAIT_ACK
1120 m.submodules.rams_replace_way_e = rwe = Decoder(NUM_WAYS)
1121 comb += rwe.n.eq(~((r1.state == State.RELOAD_WAIT_ACK) & bus.ack &
1122 ~r1.write_bram))
1123 comb += rwe.i.eq(replace_way)
1124
1125 m.submodules.rams_hit_way_e = hwe = Decoder(NUM_WAYS)
1126 comb += hwe.i.eq(r1.hit_way)
1127
1128 # this one is gated with write_bram, and replace_way_e can never be
1129 # set at the same time. that means that do_write can OR the outputs
1130 m.submodules.rams_hit_req_way_e = hre = Decoder(NUM_WAYS)
1131 comb += hre.n.eq(~r1.write_bram) # Decoder.n is inverted
1132 comb += hre.i.eq(r1.req.hit_way)
1133
1134 # common Signals
1135 do_read = Signal()
1136 wr_addr = Signal(ROW_BITS)
1137 wr_data = Signal(WB_DATA_BITS)
1138 wr_sel = Signal(ROW_SIZE)
1139 rd_addr = Signal(ROW_BITS)
1140
1141 comb += do_read.eq(1) # always enable
1142 comb += rd_addr.eq(early_req_row)
1143
1144 # Write mux:
1145 #
1146 # Defaults to wishbone read responses (cache refill)
1147 #
1148 # For timing, the mux on wr_data/sel/addr is not
1149 # dependent on anything other than the current state.
1150
1151 with m.If(r1.write_bram):
1152 # Write store data to BRAM. This happens one
1153 # cycle after the store is in r0.
1154 comb += wr_data.eq(r1.req.data)
1155 comb += wr_sel.eq(r1.req.byte_sel)
1156 comb += wr_addr.eq(get_row(r1.req.real_addr))
1157
1158 with m.Else():
1159 # Otherwise, we might be doing a reload or a DCBZ
1160 with m.If(r1.dcbz):
1161 comb += wr_data.eq(0)
1162 with m.Else():
1163 comb += wr_data.eq(bus.dat_r)
1164 comb += wr_addr.eq(r1.store_row)
1165 comb += wr_sel.eq(~0) # all 1s
1166
1167 # set up Cache Rams
1168 for i in range(NUM_WAYS):
1169 do_write = Signal(name="do_wr%d" % i)
1170 wr_sel_m = Signal(ROW_SIZE, name="wr_sel_m_%d" % i)
1171 d_out = Signal(WB_DATA_BITS, name="dout_%d" % i) # cache_row_t
1172
1173 way = CacheRam(ROW_BITS, WB_DATA_BITS, ADD_BUF=True, ram_num=i)
1174 setattr(m.submodules, "cacheram_%d" % i, way)
1175
1176 comb += way.rd_en.eq(do_read)
1177 comb += way.rd_addr.eq(rd_addr)
1178 comb += d_out.eq(way.rd_data_o)
1179 comb += way.wr_sel.eq(wr_sel_m)
1180 comb += way.wr_addr.eq(wr_addr)
1181 comb += way.wr_data.eq(wr_data)
1182
1183 # Cache hit reads
1184 with m.If(hwe.o[i]):
1185 comb += cache_out_row.eq(d_out)
1186
1187 # these are mutually-exclusive via their Decoder-enablers
1188 # (note: Decoder-enable is inverted)
1189 comb += do_write.eq(hre.o[i] | rwe.o[i])
1190
1191 # Mask write selects with do_write since BRAM
1192 # doesn't have a global write-enable
1193 with m.If(do_write):
1194 comb += wr_sel_m.eq(wr_sel)
1195
1196 # Cache hit synchronous machine for the easy case.
1197 # This handles load hits.
1198 # It also handles error cases (TLB miss, cache paradox)
1199 def dcache_fast_hit(self, m, req_op, r0_valid, r0, r1,
1200 req_hit_way, req_index, req_tag, access_ok,
1201 tlb_hit, tlb_req_index):
1202 comb = m.d.comb
1203 sync = m.d.sync
1204
1205 with m.If(req_op != Op.OP_NONE):
1206 sync += Display("op:%d addr:%x nc: %d idx: %x tag: %x way: %x",
1207 req_op, r0.req.addr, r0.req.nc,
1208 req_index, req_tag, req_hit_way)
1209
1210 with m.If(r0_valid):
1211 sync += r1.mmu_req.eq(r0.mmu_req)
1212
1213 # Fast path for load/store hits.
1214 # Set signals for the writeback controls.
1215 sync += r1.hit_way.eq(req_hit_way)
1216 sync += r1.hit_index.eq(req_index)
1217
1218 sync += r1.hit_load_valid.eq(req_op == Op.OP_LOAD_HIT)
1219 sync += r1.cache_hit.eq((req_op == Op.OP_LOAD_HIT) |
1220 (req_op == Op.OP_STORE_HIT))
1221
1222 with m.If(req_op == Op.OP_BAD):
1223 sync += Display("Signalling ld/st error "
1224 "ls_error=%i mmu_error=%i cache_paradox=%i",
1225 ~r0.mmu_req, r0.mmu_req, access_ok)
1226 sync += r1.ls_error.eq(~r0.mmu_req)
1227 sync += r1.mmu_error.eq(r0.mmu_req)
1228 sync += r1.cache_paradox.eq(access_ok)
1229 with m.Else():
1230 sync += r1.ls_error.eq(0)
1231 sync += r1.mmu_error.eq(0)
1232 sync += r1.cache_paradox.eq(0)
1233
1234 sync += r1.stcx_fail.eq(req_op == Op.OP_STCX_FAIL)
1235
1236 # Record TLB hit information for updating TLB PLRU
1237 sync += r1.tlb_hit.eq(tlb_hit)
1238 sync += r1.tlb_hit_index.eq(tlb_req_index)
1239
1240 # Memory accesses are handled by this state machine:
1241 #
1242 # * Cache load miss/reload (in conjunction with "rams")
1243 # * Load hits for non-cachable forms
1244 # * Stores (the collision case is handled in "rams")
1245 #
1246 # All wishbone requests generation is done here.
1247 # This machine operates at stage 1.
1248 def dcache_slow(self, m, r1, use_forward1_next, use_forward2_next,
1249 r0, replace_way,
1250 req_hit_way, req_same_tag,
1251 r0_valid, req_op, cache_tags, req_go, ra):
1252
1253 comb = m.d.comb
1254 sync = m.d.sync
1255 bus = self.bus
1256 d_in = self.d_in
1257
1258 req = MemAccessRequest("mreq_ds")
1259
1260 req_row = Signal(ROW_BITS)
1261 req_idx = Signal(INDEX_BITS)
1262 req_tag = Signal(TAG_BITS)
1263 comb += req_idx.eq(get_index(req.real_addr))
1264 comb += req_row.eq(get_row(req.real_addr))
1265 comb += req_tag.eq(get_tag(req.real_addr))
1266
1267 sync += r1.use_forward1.eq(use_forward1_next)
1268 sync += r1.forward_sel.eq(0)
1269
1270 with m.If(use_forward1_next):
1271 sync += r1.forward_sel.eq(r1.req.byte_sel)
1272 with m.Elif(use_forward2_next):
1273 sync += r1.forward_sel.eq(r1.forward_sel1)
1274
1275 sync += r1.forward_data2.eq(r1.forward_data1)
1276 with m.If(r1.write_bram):
1277 sync += r1.forward_data1.eq(r1.req.data)
1278 sync += r1.forward_sel1.eq(r1.req.byte_sel)
1279 sync += r1.forward_way1.eq(r1.req.hit_way)
1280 sync += r1.forward_row1.eq(get_row(r1.req.real_addr))
1281 sync += r1.forward_valid1.eq(1)
1282 with m.Else():
1283 with m.If(r1.dcbz):
1284 sync += r1.forward_data1.eq(0)
1285 with m.Else():
1286 sync += r1.forward_data1.eq(bus.dat_r)
1287 sync += r1.forward_sel1.eq(~0) # all 1s
1288 sync += r1.forward_way1.eq(replace_way)
1289 sync += r1.forward_row1.eq(r1.store_row)
1290 sync += r1.forward_valid1.eq(0)
1291
1292 # One cycle pulses reset
1293 sync += r1.slow_valid.eq(0)
1294 sync += r1.write_bram.eq(0)
1295 sync += r1.inc_acks.eq(0)
1296 sync += r1.dec_acks.eq(0)
1297
1298 sync += r1.ls_valid.eq(0)
1299 # complete tlbies and TLB loads in the third cycle
1300 sync += r1.mmu_done.eq(r0_valid & (r0.tlbie | r0.tlbld))
1301
1302 with m.If((req_op == Op.OP_LOAD_HIT) | (req_op == Op.OP_STCX_FAIL)):
1303 with m.If(~r0.mmu_req):
1304 sync += r1.ls_valid.eq(1)
1305 with m.Else():
1306 sync += r1.mmu_done.eq(1)
1307
1308 with m.If(r1.write_tag):
1309 # Store new tag in selected way
1310 replace_way_onehot = Signal(NUM_WAYS)
1311 comb += replace_way_onehot.eq(1<<replace_way)
1312 for i in range(NUM_WAYS):
1313 with m.If(replace_way_onehot[i]):
1314 ct = Signal(TAG_RAM_WIDTH)
1315 comb += ct.eq(cache_tags[r1.store_index].tag)
1316 comb += ct.word_select(i, TAG_WIDTH).eq(r1.reload_tag)
1317 sync += cache_tags[r1.store_index].tag.eq(ct)
1318 sync += r1.store_way.eq(replace_way)
1319 sync += r1.write_tag.eq(0)
1320
1321 # Take request from r1.req if there is one there,
1322 # else from req_op, ra, etc.
1323 with m.If(r1.full):
1324 comb += req.eq(r1.req)
1325 with m.Else():
1326 comb += req.op.eq(req_op)
1327 comb += req.valid.eq(req_go)
1328 comb += req.mmu_req.eq(r0.mmu_req)
1329 comb += req.dcbz.eq(r0.req.dcbz)
1330 comb += req.real_addr.eq(ra)
1331
1332 with m.If(r0.req.dcbz):
1333 # force data to 0 for dcbz
1334 comb += req.data.eq(0)
1335 with m.Elif(r0.d_valid):
1336 comb += req.data.eq(r0.req.data)
1337 with m.Else():
1338 comb += req.data.eq(d_in.data)
1339
1340 # Select all bytes for dcbz
1341 # and for cacheable loads
1342 with m.If(r0.req.dcbz | (r0.req.load & ~r0.req.nc)):
1343 comb += req.byte_sel.eq(~0) # all 1s
1344 with m.Else():
1345 comb += req.byte_sel.eq(r0.req.byte_sel)
1346 comb += req.hit_way.eq(req_hit_way)
1347 comb += req.same_tag.eq(req_same_tag)
1348
1349 # Store the incoming request from r0,
1350 # if it is a slow request
1351 # Note that r1.full = 1 implies req_op = OP_NONE
1352 with m.If((req_op == Op.OP_LOAD_MISS)
1353 | (req_op == Op.OP_LOAD_NC)
1354 | (req_op == Op.OP_STORE_MISS)
1355 | (req_op == Op.OP_STORE_HIT)):
1356 sync += r1.req.eq(req)
1357 sync += r1.full.eq(1)
1358
1359 # Main state machine
1360 with m.Switch(r1.state):
1361
1362 with m.Case(State.IDLE):
1363 sync += r1.wb.adr.eq(req.real_addr[ROW_LINE_BITS:])
1364 sync += r1.wb.sel.eq(req.byte_sel)
1365 sync += r1.wb.dat.eq(req.data)
1366 sync += r1.dcbz.eq(req.dcbz)
1367
1368 # Keep track of our index and way
1369 # for subsequent stores.
1370 sync += r1.store_index.eq(req_idx)
1371 sync += r1.store_row.eq(req_row)
1372 sync += r1.end_row_ix.eq(get_row_of_line(req_row)-1)
1373 sync += r1.reload_tag.eq(req_tag)
1374 sync += r1.req.same_tag.eq(1)
1375
1376 with m.If(req.op == Op.OP_STORE_HIT):
1377 sync += r1.store_way.eq(req.hit_way)
1378
1379 # Reset per-row valid bits,
1380 # ready for handling OP_LOAD_MISS
1381 for i in range(ROW_PER_LINE):
1382 sync += r1.rows_valid[i].eq(0)
1383
1384 with m.If(req_op != Op.OP_NONE):
1385 sync += Display("cache op %d", req.op)
1386
1387 with m.Switch(req.op):
1388 with m.Case(Op.OP_LOAD_HIT):
1389 # stay in IDLE state
1390 pass
1391
1392 with m.Case(Op.OP_LOAD_MISS):
1393 sync += Display("cache miss real addr: %x " \
1394 "idx: %x tag: %x",
1395 req.real_addr, req_row, req_tag)
1396
1397 # Start the wishbone cycle
1398 sync += r1.wb.we.eq(0)
1399 sync += r1.wb.cyc.eq(1)
1400 sync += r1.wb.stb.eq(1)
1401
1402 # Track that we had one request sent
1403 sync += r1.state.eq(State.RELOAD_WAIT_ACK)
1404 sync += r1.write_tag.eq(1)
1405
1406 with m.Case(Op.OP_LOAD_NC):
1407 sync += r1.wb.cyc.eq(1)
1408 sync += r1.wb.stb.eq(1)
1409 sync += r1.wb.we.eq(0)
1410 sync += r1.state.eq(State.NC_LOAD_WAIT_ACK)
1411
1412 with m.Case(Op.OP_STORE_HIT, Op.OP_STORE_MISS):
1413 with m.If(~req.dcbz):
1414 sync += r1.state.eq(State.STORE_WAIT_ACK)
1415 sync += r1.acks_pending.eq(1)
1416 sync += r1.full.eq(0)
1417 sync += r1.slow_valid.eq(1)
1418
1419 with m.If(~req.mmu_req):
1420 sync += r1.ls_valid.eq(1)
1421 with m.Else():
1422 sync += r1.mmu_done.eq(1)
1423
1424 with m.If(req.op == Op.OP_STORE_HIT):
1425 sync += r1.write_bram.eq(1)
1426 with m.Else():
1427 # dcbz is handled much like a load miss except
1428 # that we are writing to memory instead of reading
1429 sync += r1.state.eq(State.RELOAD_WAIT_ACK)
1430
1431 with m.If(req.op == Op.OP_STORE_MISS):
1432 sync += r1.write_tag.eq(1)
1433
1434 sync += r1.wb.we.eq(1)
1435 sync += r1.wb.cyc.eq(1)
1436 sync += r1.wb.stb.eq(1)
1437
1438 # OP_NONE and OP_BAD do nothing
1439 # OP_BAD & OP_STCX_FAIL were
1440 # handled above already
1441 with m.Case(Op.OP_NONE):
1442 pass
1443 with m.Case(Op.OP_BAD):
1444 pass
1445 with m.Case(Op.OP_STCX_FAIL):
1446 pass
1447
1448 with m.Case(State.RELOAD_WAIT_ACK):
1449 ld_stbs_done = Signal()
1450 # Requests are all sent if stb is 0
1451 comb += ld_stbs_done.eq(~r1.wb.stb)
1452
1453 # If we are still sending requests, was one accepted?
1454 with m.If((~bus.stall) & r1.wb.stb):
1455 # That was the last word? We are done sending.
1456 # Clear stb and set ld_stbs_done so we can handle an
1457 # eventual last ack on the same cycle.
1458 # sigh - reconstruct wb adr with 3 extra 0s at front
1459 wb_adr = Cat(Const(0, ROW_OFF_BITS), r1.wb.adr)
1460 with m.If(is_last_row_addr(wb_adr, r1.end_row_ix)):
1461 sync += r1.wb.stb.eq(0)
1462 comb += ld_stbs_done.eq(1)
1463
1464 # Calculate the next row address in the current cache line
1465 row = Signal(LINE_OFF_BITS-ROW_OFF_BITS)
1466 comb += row.eq(r1.wb.adr)
1467 sync += r1.wb.adr[:LINE_OFF_BITS-ROW_OFF_BITS].eq(row+1)
1468
1469 # Incoming acks processing
1470 sync += r1.forward_valid1.eq(bus.ack)
1471 with m.If(bus.ack):
1472 srow = Signal(ROW_LINE_BITS)
1473 comb += srow.eq(r1.store_row)
1474 sync += r1.rows_valid[srow].eq(1)
1475
1476 # If this is the data we were looking for,
1477 # we can complete the request next cycle.
1478 # Compare the whole address in case the
1479 # request in r1.req is not the one that
1480 # started this refill.
1481 with m.If(req.valid & r1.req.same_tag &
1482 ((r1.dcbz & r1.req.dcbz) |
1483 (~r1.dcbz & (r1.req.op == Op.OP_LOAD_MISS))) &
1484 (r1.store_row == get_row(req.real_addr))):
1485 sync += r1.full.eq(0)
1486 sync += r1.slow_valid.eq(1)
1487 with m.If(~r1.mmu_req):
1488 sync += r1.ls_valid.eq(1)
1489 with m.Else():
1490 sync += r1.mmu_done.eq(1)
1491 sync += r1.forward_sel.eq(~0) # all 1s
1492 sync += r1.use_forward1.eq(1)
1493
1494 # Check for completion
1495 with m.If(ld_stbs_done & is_last_row(r1.store_row,
1496 r1.end_row_ix)):
1497 # Complete wishbone cycle
1498 sync += r1.wb.cyc.eq(0)
1499
1500 # Cache line is now valid
1501 cv = Signal(INDEX_BITS)
1502 comb += cv.eq(cache_tags[r1.store_index].valid)
1503 comb += cv.bit_select(r1.store_way, 1).eq(1)
1504 sync += cache_tags[r1.store_index].valid.eq(cv)
1505
1506 sync += r1.state.eq(State.IDLE)
1507 sync += Display("cache valid set %x "
1508 "idx %d way %d",
1509 cv, r1.store_index, r1.store_way)
1510
1511 # Increment store row counter
1512 sync += r1.store_row.eq(next_row(r1.store_row))
1513
1514 with m.Case(State.STORE_WAIT_ACK):
1515 st_stbs_done = Signal()
1516 acks = Signal(3)
1517 adjust_acks = Signal(3)
1518
1519 comb += st_stbs_done.eq(~r1.wb.stb)
1520 comb += acks.eq(r1.acks_pending)
1521
1522 with m.If(r1.inc_acks != r1.dec_acks):
1523 with m.If(r1.inc_acks):
1524 comb += adjust_acks.eq(acks + 1)
1525 with m.Else():
1526 comb += adjust_acks.eq(acks - 1)
1527 with m.Else():
1528 comb += adjust_acks.eq(acks)
1529
1530 sync += r1.acks_pending.eq(adjust_acks)
1531
1532 # Clear stb when slave accepted request
1533 with m.If(~bus.stall):
1534 # See if there is another store waiting
1535 # to be done which is in the same real page.
1536 with m.If(req.valid):
1537 _ra = req.real_addr[ROW_LINE_BITS:SET_SIZE_BITS]
1538 sync += r1.wb.adr[0:SET_SIZE_BITS].eq(_ra)
1539 sync += r1.wb.dat.eq(req.data)
1540 sync += r1.wb.sel.eq(req.byte_sel)
1541
1542 with m.If((adjust_acks < 7) & req.same_tag &
1543 ((req.op == Op.OP_STORE_MISS)
1544 | (req.op == Op.OP_STORE_HIT))):
1545 sync += r1.wb.stb.eq(1)
1546 comb += st_stbs_done.eq(0)
1547
1548 with m.If(req.op == Op.OP_STORE_HIT):
1549 sync += r1.write_bram.eq(1)
1550 sync += r1.full.eq(0)
1551 sync += r1.slow_valid.eq(1)
1552
1553 # Store requests never come from the MMU
1554 sync += r1.ls_valid.eq(1)
1555 comb += st_stbs_done.eq(0)
1556 sync += r1.inc_acks.eq(1)
1557 with m.Else():
1558 sync += r1.wb.stb.eq(0)
1559 comb += st_stbs_done.eq(1)
1560
1561 # Got ack ? See if complete.
1562 with m.If(bus.ack):
1563 with m.If(st_stbs_done & (adjust_acks == 1)):
1564 sync += r1.state.eq(State.IDLE)
1565 sync += r1.wb.cyc.eq(0)
1566 sync += r1.wb.stb.eq(0)
1567 sync += r1.dec_acks.eq(1)
1568
1569 with m.Case(State.NC_LOAD_WAIT_ACK):
1570 # Clear stb when slave accepted request
1571 with m.If(~bus.stall):
1572 sync += r1.wb.stb.eq(0)
1573
1574 # Got ack ? complete.
1575 with m.If(bus.ack):
1576 sync += r1.state.eq(State.IDLE)
1577 sync += r1.full.eq(0)
1578 sync += r1.slow_valid.eq(1)
1579
1580 with m.If(~r1.mmu_req):
1581 sync += r1.ls_valid.eq(1)
1582 with m.Else():
1583 sync += r1.mmu_done.eq(1)
1584
1585 sync += r1.forward_sel.eq(~0) # all 1s
1586 sync += r1.use_forward1.eq(1)
1587 sync += r1.wb.cyc.eq(0)
1588 sync += r1.wb.stb.eq(0)
1589
1590 def dcache_log(self, m, r1, valid_ra, tlb_hit, stall_out, req_op):
1591
1592 sync = m.d.sync
1593 d_out, bus, log_out = self.d_out, self.bus, self.log_out
1594
1595 sync += log_out.eq(Cat(r1.state[:3], valid_ra, tlb_hit.way[:3],
1596 stall_out, req_op[:3], d_out.valid, d_out.error,
1597 r1.wb.cyc, r1.wb.stb, bus.ack, bus.stall,
1598 r1.wb.adr[:3])) # low row-address bits (was real_adr[3:6])
1599
1600 def elaborate(self, platform):
1601
1602 m = Module()
1603 comb = m.d.comb
1604 d_in = self.d_in
1605
1606 # Storage. Hopefully "cache_rows" is a BRAM, the rest is LUTs
1607 cache_tags = CacheTagArray()
1608 cache_tag_set = Signal(TAG_RAM_WIDTH)
1609
1610 # TODO attribute ram_style : string;
1611 # TODO attribute ram_style of cache_tags : signal is "distributed";
1612
1613 """note: these are passed to nmigen.hdl.Memory as "attributes".
1614 don't know how, just that they are.
1615 """
1616 # TODO attribute ram_style of
1617 # dtlb_tags : signal is "distributed";
1618 # TODO attribute ram_style of
1619 # dtlb_ptes : signal is "distributed";
1620
1621 r0 = RegStage0("r0")
1622 r0_full = Signal()
1623
1624 r1 = RegStage1("r1")
1625
1626 reservation = Reservation()
1627
1628 # Async signals on incoming request
1629 req_index = Signal(INDEX_BITS)
1630 req_row = Signal(ROW_BITS)
1631 req_hit_way = Signal(WAY_BITS)
1632 req_tag = Signal(TAG_BITS)
1633 req_op = Signal(Op)
1634 req_data = Signal(64)
1635 req_same_tag = Signal()
1636 req_go = Signal()
1637
1638 early_req_row = Signal(ROW_BITS)
1639
1640 cancel_store = Signal()
1641 set_rsrv = Signal()
1642 clear_rsrv = Signal()
1643
1644 r0_valid = Signal()
1645 r0_stall = Signal()
1646
1647 use_forward1_next = Signal()
1648 use_forward2_next = Signal()
1649
1650 cache_out_row = Signal(WB_DATA_BITS)
1651
1652 plru_victim = Signal(WAY_BITS)
1653 replace_way = Signal(WAY_BITS)
1654
1655 # Wishbone read/write/cache write formatting signals
1656 bus_sel = Signal(8)
1657
1658 # TLB signals
1659 tlb_way = TLBRecord("tlb_way")
1660 tlb_req_index = Signal(TLB_SET_BITS)
1661 tlb_hit = TLBHit("tlb_hit")
1662 pte = Signal(TLB_PTE_BITS)
1663 ra = Signal(REAL_ADDR_BITS)
1664 valid_ra = Signal()
1665 perm_attr = PermAttr("dc_perms")
1666 rc_ok = Signal()
1667 perm_ok = Signal()
1668 access_ok = Signal()
1669
1670 tlb_plru_victim = Signal(TLB_WAY_BITS)
1671
1672 # we don't yet handle collisions between loadstore1 requests
1673 # and MMU requests
1674 comb += self.m_out.stall.eq(0)
1675
1676 # Hold off the request in r0 when r1 has an uncompleted request
1677 comb += r0_stall.eq(r0_full & (r1.full | d_in.hold))
1678 comb += r0_valid.eq(r0_full & ~r1.full & ~d_in.hold)
1679 comb += self.stall_out.eq(r0_stall)
1680
1681 # deal with litex not doing wishbone pipeline mode.
1682 # XXX this is done in the wrong way: FIFOs are needed in the SRAM
1683 # test so that stb/ack match up. the same thing is done in icache.py
1684 comb += self.bus.stall.eq(self.bus.cyc & ~self.bus.ack)
1685
1686 # Wire up wishbone request latch out of stage 1
1687 comb += self.bus.we.eq(r1.wb.we)
1688 comb += self.bus.adr.eq(r1.wb.adr)
1689 comb += self.bus.sel.eq(r1.wb.sel)
1690 comb += self.bus.stb.eq(r1.wb.stb)
1691 comb += self.bus.dat_w.eq(r1.wb.dat)
1692 comb += self.bus.cyc.eq(r1.wb.cyc)
1693
1694 # create submodule TLBUpdate
1695 m.submodules.dtlb_update = self.dtlb_update = DTLBUpdate()
1696 dtlb = self.dtlb_update.dtlb
1697
1698 # call sub-functions putting everything together, using shared
1699 # signals established above
1700 self.stage_0(m, r0, r1, r0_full)
1701 self.tlb_read(m, r0_stall, tlb_way, dtlb)
1702 self.tlb_search(m, tlb_req_index, r0, r0_valid,
1703 tlb_way,
1704 pte, tlb_hit, valid_ra, perm_attr, ra)
1705 self.tlb_update(m, r0_valid, r0, dtlb, tlb_req_index,
1706 tlb_hit, tlb_plru_victim,
1707 tlb_way)
1708 self.maybe_plrus(m, r1, plru_victim)
1709 self.maybe_tlb_plrus(m, r1, tlb_plru_victim, tlb_req_index)
1710 self.cache_tag_read(m, r0_stall, req_index, cache_tag_set, cache_tags)
1711 self.dcache_request(m, r0, ra, req_index, req_row, req_tag,
1712 r0_valid, r1, cache_tags, replace_way,
1713 use_forward1_next, use_forward2_next,
1714 req_hit_way, plru_victim, rc_ok, perm_attr,
1715 valid_ra, perm_ok, access_ok, req_op, req_go,
1716 tlb_hit, tlb_way, cache_tag_set,
1717 cancel_store, req_same_tag, r0_stall, early_req_row)
1718 self.reservation_comb(m, cancel_store, set_rsrv, clear_rsrv,
1719 r0_valid, r0, reservation)
1720 self.reservation_reg(m, r0_valid, access_ok, set_rsrv, clear_rsrv,
1721 reservation, r0)
1722 self.writeback_control(m, r1, cache_out_row)
1723 self.rams(m, r1, early_req_row, cache_out_row, replace_way)
1724 self.dcache_fast_hit(m, req_op, r0_valid, r0, r1,
1725 req_hit_way, req_index, req_tag, access_ok,
1726 tlb_hit, tlb_req_index)
1727 self.dcache_slow(m, r1, use_forward1_next, use_forward2_next,
1728 r0, replace_way,
1729 req_hit_way, req_same_tag,
1730 r0_valid, req_op, cache_tags, req_go, ra)
1731 #self.dcache_log(m, r1, valid_ra, tlb_hit, stall_out, req_op)
1732
1733 return m
1734
1735
1736 if __name__ == '__main__':
1737 dut = DCache()
1738 vl = rtlil.convert(dut, ports=[])
1739 with open("test_dcache.il", "w") as f:
1740 f.write(vl)