1 """DCache
2
3 based on Anton Blanchard microwatt dcache.vhdl
4
5 note that the microwatt dcache wishbone interface expects "stall".
6 for simplicity at the moment this is hard-coded to cyc & ~ack.
7 see WB4 spec, p84, section 5.2.1
8
9 IMPORTANT: for store, the data is sampled the cycle AFTER the "valid"
10 is raised. sigh
11
12 Links:
13
14 * https://libre-soc.org/3d_gpu/architecture/set_associative_cache.jpg
15 * https://bugs.libre-soc.org/show_bug.cgi?id=469
16
17 """
18
19 import sys
20
21 from nmutil.gtkw import write_gtkw
22
23 sys.setrecursionlimit(1000000)
24
25 from enum import Enum, unique
26
27 from nmigen import (Module, Signal, Elaboratable, Cat, Repl, Array, Const,
28 Record)
29 from nmutil.util import Display
30
31 from copy import deepcopy
32 from random import randint, seed
33
34 from nmigen_soc.wishbone.bus import Interface
35
36 from nmigen.cli import main
37 from nmutil.iocontrol import RecordObject
38 from nmigen.utils import log2_int
39 from soc.experiment.mem_types import (LoadStore1ToDCacheType,
40 DCacheToLoadStore1Type,
41 MMUToDCacheType,
42 DCacheToMMUType)
43
44 from soc.experiment.wb_types import (WB_ADDR_BITS, WB_DATA_BITS, WB_SEL_BITS,
45 WBAddrType, WBDataType, WBSelType,
46 WBMasterOut, WBSlaveOut,
47 WBMasterOutVector, WBSlaveOutVector,
48 WBIOMasterOut, WBIOSlaveOut)
49
50 from soc.experiment.cache_ram import CacheRam
51 #from soc.experiment.plru import PLRU
52 from nmutil.plru import PLRU
53
54 # for test
55 from soc.bus.sram import SRAM
56 from nmigen import Memory
57 from nmigen.cli import rtlil
58
59 # NOTE: to use cxxsim, export NMIGEN_SIM_MODE=cxxsim from the shell
60 # Also, check out the cxxsim nmigen branch, and latest yosys from git
61 from nmutil.sim_tmp_alternative import Simulator
62
63 from nmutil.util import wrap
64
65
# TODO: make these parameters of DCache at some point
LINE_SIZE = 64    # Line size in bytes
NUM_LINES = 16    # Number of lines in a set
NUM_WAYS = 4      # Number of ways
TLB_SET_SIZE = 64 # L1 DTLB entries per set
TLB_NUM_WAYS = 2  # L1 DTLB number of ways
TLB_LG_PGSZ = 12  # L1 DTLB log_2(page_size)
LOG_LENGTH = 0    # Non-zero to enable log data collection

# BRAM organisation: We never access more than
# WB_DATA_BITS at a time so to save
# resources we make the array only that wide, and
# use consecutive indices to make a cache "line"
#
# ROW_SIZE is the width in bytes of the BRAM
# (based on WB, so 64-bits)
ROW_SIZE = WB_DATA_BITS // 8

# ROW_PER_LINE is the number of rows (wishbone
# transactions) in a line
ROW_PER_LINE = LINE_SIZE // ROW_SIZE

# BRAM_ROWS is the number of rows in BRAM needed
# to represent the full dcache
BRAM_ROWS = NUM_LINES * ROW_PER_LINE
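
# Worked example with the defaults above (illustrative only):
# WB_DATA_BITS = 64 so ROW_SIZE = 64 // 8 = 8 bytes,
# ROW_PER_LINE = 64 // 8 = 8 wishbone beats per cache line, and
# BRAM_ROWS = 16 * 8 = 128 BRAM rows for the whole dcache.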

print ("ROW_SIZE", ROW_SIZE)
print ("ROW_PER_LINE", ROW_PER_LINE)
print ("BRAM_ROWS", BRAM_ROWS)
print ("NUM_WAYS", NUM_WAYS)

# Bit field counts in the address

# REAL_ADDR_BITS is the number of real address
# bits that we store
REAL_ADDR_BITS = 56

# ROW_BITS is the number of bits to select a row
ROW_BITS = log2_int(BRAM_ROWS)

# ROW_LINE_BITS is the number of bits to select
# a row within a line
ROW_LINE_BITS = log2_int(ROW_PER_LINE)

# LINE_OFF_BITS is the number of bits for
# the offset in a cache line
LINE_OFF_BITS = log2_int(LINE_SIZE)

# ROW_OFF_BITS is the number of bits for
# the offset in a row
ROW_OFF_BITS = log2_int(ROW_SIZE)

# INDEX_BITS is the number of bits to
# select a cache line
INDEX_BITS = log2_int(NUM_LINES)

# SET_SIZE_BITS is the log base 2 of the set size
SET_SIZE_BITS = LINE_OFF_BITS + INDEX_BITS

# TAG_BITS is the number of bits of
# the tag part of the address
TAG_BITS = REAL_ADDR_BITS - SET_SIZE_BITS

# TAG_WIDTH is the width in bits of each way of the tag RAM
TAG_WIDTH = TAG_BITS + 7 - ((TAG_BITS + 7) % 8)

# WAY_BITS is the number of bits to select a way
WAY_BITS = log2_int(NUM_WAYS)

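# Worked example of the derived widths with the defaults (illustrative
# only): ROW_BITS = 7, ROW_LINE_BITS = 3, LINE_OFF_BITS = 6,
# ROW_OFF_BITS = 3, INDEX_BITS = 4, SET_SIZE_BITS = 6 + 4 = 10,
# TAG_BITS = 56 - 10 = 46, TAG_WIDTH = 48 (46 rounded up to a whole
# number of bytes) and WAY_BITS = 2.
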
# Example of layout for 32 lines of 64 bytes:
layout = """\
  ..  tag    |index|  line  |
  ..         |   row   |    |
  ..         |     |---|    | ROW_LINE_BITS  (3)
  ..         |     |--- - --| LINE_OFF_BITS (6)
  ..         |         |- --| ROW_OFF_BITS  (3)
  ..         |----- ---|    | ROW_BITS      (8)
  ..         |-----|        | INDEX_BITS    (5)
  ..  --------|             | TAG_BITS      (45)
"""
print (layout)
print ("Dcache TAG %d IDX %d ROW_BITS %d ROFF %d LOFF %d RLB %d" % \
            (TAG_BITS, INDEX_BITS, ROW_BITS,
             ROW_OFF_BITS, LINE_OFF_BITS, ROW_LINE_BITS))
print ("index @: %d-%d" % (LINE_OFF_BITS, SET_SIZE_BITS))
print ("row @: %d-%d" % (ROW_OFF_BITS, SET_SIZE_BITS))
print ("tag @: %d-%d width %d" % (SET_SIZE_BITS, REAL_ADDR_BITS, TAG_WIDTH))

TAG_RAM_WIDTH = TAG_WIDTH * NUM_WAYS

print ("TAG_RAM_WIDTH", TAG_RAM_WIDTH)

def CacheTagArray():
    tag_layout = [('valid', NUM_WAYS), # one valid bit per way
                  ('tag', TAG_RAM_WIDTH),
                 ]
    return Array(Record(tag_layout, name="tag%d" % x) for x in range(NUM_LINES))

def RowPerLineValidArray():
    return Array(Signal(name="rows_valid%d" % x) \
                 for x in range(ROW_PER_LINE))

# L1 TLB
TLB_SET_BITS = log2_int(TLB_SET_SIZE)
TLB_WAY_BITS = log2_int(TLB_NUM_WAYS)
TLB_EA_TAG_BITS = 64 - (TLB_LG_PGSZ + TLB_SET_BITS)
TLB_TAG_WAY_BITS = TLB_NUM_WAYS * TLB_EA_TAG_BITS
TLB_PTE_BITS = 64
TLB_PTE_WAY_BITS = TLB_NUM_WAYS * TLB_PTE_BITS

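# Worked example for the TLB geometry above (illustrative only):
# TLB_SET_BITS = 6, TLB_WAY_BITS = 1,
# TLB_EA_TAG_BITS = 64 - (12 + 6) = 46,
# TLB_TAG_WAY_BITS = 2 * 46 = 92 and TLB_PTE_WAY_BITS = 2 * 64 = 128:
# each TLB set packs the tags and PTEs of both ways side by side.
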
def ispow2(x):
    return (1<<log2_int(x, False)) == x

assert (LINE_SIZE % ROW_SIZE) == 0, "LINE_SIZE not multiple of ROW_SIZE"
assert ispow2(LINE_SIZE), "LINE_SIZE not power of 2"
assert ispow2(NUM_LINES), "NUM_LINES not power of 2"
assert ispow2(ROW_PER_LINE), "ROW_PER_LINE not power of 2"
assert ROW_BITS == (INDEX_BITS + ROW_LINE_BITS), "geometry bits don't add up"
assert (LINE_OFF_BITS == ROW_OFF_BITS + ROW_LINE_BITS), \
        "geometry bits don't add up"
assert REAL_ADDR_BITS == (TAG_BITS + INDEX_BITS + LINE_OFF_BITS), \
        "geometry bits don't add up"
assert REAL_ADDR_BITS == (TAG_BITS + ROW_BITS + ROW_OFF_BITS), \
        "geometry bits don't add up"
assert 64 == WB_DATA_BITS, "Can't yet handle wb width that isn't 64-bits"
assert SET_SIZE_BITS <= TLB_LG_PGSZ, "Set indexed by virtual address"


def TLBTagEAArray():
    return Array(Signal(TLB_EA_TAG_BITS, name="tlbtagea%d" % x) \
                 for x in range (TLB_NUM_WAYS))

def TLBArray():
    tlb_layout = [('valid', TLB_NUM_WAYS), # one valid bit per TLB way
                  ('tag', TLB_TAG_WAY_BITS),
                  ('pte', TLB_PTE_WAY_BITS)
                 ]
    return Array(Record(tlb_layout, name="tlb%d" % x) \
                 for x in range(TLB_SET_SIZE))

def HitWaySet():
    return Array(Signal(WAY_BITS, name="hitway_%d" % x) \
                 for x in range(TLB_NUM_WAYS))

# Cache RAM interface
def CacheRamOut():
    return Array(Signal(WB_DATA_BITS, name="cache_out%d" % x) \
                 for x in range(NUM_WAYS))

# PLRU output interface
def PLRUOut():
    return Array(Signal(WAY_BITS, name="plru_out%d" % x) \
                 for x in range(NUM_LINES))

# TLB PLRU output interface
def TLBPLRUOut():
    return Array(Signal(TLB_WAY_BITS, name="tlbplru_out%d" % x) \
                 for x in range(TLB_SET_SIZE))

# Helper functions to decode incoming requests
#
# Return the cache line index (tag index) for an address
def get_index(addr):
    return addr[LINE_OFF_BITS:SET_SIZE_BITS]

# Return the cache row index (data memory) for an address
def get_row(addr):
    return addr[ROW_OFF_BITS:SET_SIZE_BITS]

# Return the index of a row within a line
def get_row_of_line(row):
    return row[:ROW_BITS][:ROW_LINE_BITS]

# Returns whether this is the last row of a line
def is_last_row_addr(addr, last):
    return addr[ROW_OFF_BITS:LINE_OFF_BITS] == last

# Returns whether this is the last row of a line
def is_last_row(row, last):
    return get_row_of_line(row) == last

# Return the next row in the current cache line. We use a
# dedicated function in order to limit the size of the
# generated adder to be only the bits within a cache line
# (3 bits with default settings)
def next_row(row):
    row_v = row[0:ROW_LINE_BITS] + 1
    return Cat(row_v[:ROW_LINE_BITS], row[ROW_LINE_BITS:])
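
# For example, with ROW_LINE_BITS = 3, next_row(0b0010_111) gives
# 0b0010_000: only the low 3 bits pass through the adder (and wrap);
# the upper index bits are concatenated back unchanged.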

# Get the tag value from the address
def get_tag(addr):
    return addr[SET_SIZE_BITS:REAL_ADDR_BITS]
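
# Putting the decode helpers together, a real address splits up as
# follows with the default geometry (illustrative only):
#   addr[0:3]   offset of the byte within a row  (ROW_OFF_BITS)
#   addr[3:10]  row number in BRAM               (get_row)
#   addr[6:10]  cache line index                 (get_index)
#   addr[10:56] tag                              (get_tag)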

# Read a tag from a tag memory row
def read_tag(way, tagset):
    return tagset.word_select(way, TAG_WIDTH)[:TAG_BITS]

# Read a TLB tag from a TLB tag memory row
def read_tlb_tag(way, tags):
    return tags.word_select(way, TLB_EA_TAG_BITS)

# Write a TLB tag to a TLB tag memory row
def write_tlb_tag(way, tags, tag):
    return read_tlb_tag(way, tags).eq(tag)

# Read a PTE from a TLB PTE memory row
def read_tlb_pte(way, ptes):
    return ptes.word_select(way, TLB_PTE_BITS)

def write_tlb_pte(way, ptes, newpte):
    return read_tlb_pte(way, ptes).eq(newpte)
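
# word_select(way, width) picks the way'th width-bit slice of a packed
# row, so with the default TAG_WIDTH = 48, read_tag(2, tagset) selects
# tagset[96:144] and keeps the low TAG_BITS = 46 bits; the write helpers
# return .eq() assignments over the same slices. This is how all the
# ways of one set share a single flat signal.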


# Record for storing permission, attribute, etc. bits from a PTE
class PermAttr(RecordObject):
    def __init__(self, name=None):
        super().__init__(name=name)
        self.reference = Signal()
        self.changed = Signal()
        self.nocache = Signal()
        self.priv = Signal()
        self.rd_perm = Signal()
        self.wr_perm = Signal()


def extract_perm_attr(pte):
    # note: currently a stub - the PTE permission/attribute fields are
    # decoded directly in DCache.tlb_search rather than via this helper
    pa = PermAttr()
    return pa


# Type of operation on a "valid" input
@unique
class Op(Enum):
    OP_NONE = 0
    OP_BAD = 1           # NC cache hit, TLB miss, prot/RC failure
    OP_STCX_FAIL = 2     # conditional store w/o reservation
    OP_LOAD_HIT = 3      # Cache hit on load
    OP_LOAD_MISS = 4     # Load missing cache
    OP_LOAD_NC = 5       # Non-cachable load
    OP_STORE_HIT = 6     # Store hitting cache
    OP_STORE_MISS = 7    # Store missing cache


# Cache state machine
@unique
class State(Enum):
    IDLE = 0             # Normal load hit processing
    RELOAD_WAIT_ACK = 1  # Cache reload wait ack
    STORE_WAIT_ACK = 2   # Store wait ack
    NC_LOAD_WAIT_ACK = 3 # Non-cachable load wait ack


# Dcache operations:
#
# In order to make timing, we use the BRAMs with
# an output buffer, which means that the BRAM
# output is delayed by an extra cycle.
#
# Thus, the dcache has a 2-stage internal pipeline
# for cache hits with no stalls.
#
# All other operations are handled via stalling
# in the first stage.
#
# The second stage can thus complete a hit at the same
# time as the first stage emits a stall for a complex op.
#
# Stage 0 register, basically contains just the latched request

class RegStage0(RecordObject):
    def __init__(self, name=None):
        super().__init__(name=name)
        self.req = LoadStore1ToDCacheType(name="lsmem")
        self.tlbie = Signal() # indicates a tlbie request (from MMU)
        self.doall = Signal() # with tlbie, indicates flush whole TLB
        self.tlbld = Signal() # indicates a TLB load request (from MMU)
        self.mmu_req = Signal() # indicates source of request
        self.d_valid = Signal() # indicates req.data is valid now


class MemAccessRequest(RecordObject):
    def __init__(self, name=None):
        super().__init__(name=name)
        self.op = Signal(Op)
        self.valid = Signal()
        self.dcbz = Signal()
        self.real_addr = Signal(REAL_ADDR_BITS)
        self.data = Signal(64)
        self.byte_sel = Signal(8)
        self.hit_way = Signal(WAY_BITS)
        self.same_tag = Signal()
        self.mmu_req = Signal()


# First stage register, contains state for stage 1 of load hits
# and for the state machine used by all other operations
class RegStage1(RecordObject):
    def __init__(self, name=None):
        super().__init__(name=name)
        # Info about the request
        self.full = Signal() # have uncompleted request
        self.mmu_req = Signal() # request is from MMU
        self.req = MemAccessRequest(name="reqmem")

        # Cache hit state
        self.hit_way = Signal(WAY_BITS)
        self.hit_load_valid = Signal()
        self.hit_index = Signal(INDEX_BITS)
        self.cache_hit = Signal()

        # TLB hit state
        self.tlb_hit = Signal()
        self.tlb_hit_way = Signal(TLB_NUM_WAYS)
        self.tlb_hit_index = Signal(TLB_SET_BITS) # TLB set index of the hit

        # 2-stage data buffer for data forwarded from writes to reads
        self.forward_data1 = Signal(64)
        self.forward_data2 = Signal(64)
        self.forward_sel1 = Signal(8)
        self.forward_valid1 = Signal()
        self.forward_way1 = Signal(WAY_BITS)
        self.forward_row1 = Signal(ROW_BITS)
        self.use_forward1 = Signal()
        self.forward_sel = Signal(8)

        # Cache miss state (reload state machine)
        self.state = Signal(State)
        self.dcbz = Signal()
        self.write_bram = Signal()
        self.write_tag = Signal()
        self.slow_valid = Signal()
        self.wb = WBMasterOut("wb")
        self.reload_tag = Signal(TAG_BITS)
        self.store_way = Signal(WAY_BITS)
        self.store_row = Signal(ROW_BITS)
        self.store_index = Signal(INDEX_BITS)
        self.end_row_ix = Signal(ROW_LINE_BITS)
        self.rows_valid = RowPerLineValidArray()
        self.acks_pending = Signal(3)
        self.inc_acks = Signal()
        self.dec_acks = Signal()

        # Signals to complete (possibly with error)
        self.ls_valid = Signal()
        self.ls_error = Signal()
        self.mmu_done = Signal()
        self.mmu_error = Signal()
        self.cache_paradox = Signal()

        # Signal to complete a failed stcx.
        self.stcx_fail = Signal()


# Reservation information
class Reservation(RecordObject):
    def __init__(self):
        super().__init__()
        self.valid = Signal()
        self.addr = Signal(64-LINE_OFF_BITS)


class DTLBUpdate(Elaboratable):
    def __init__(self):
        self.tlbie = Signal()
        self.tlbwe = Signal()
        self.doall = Signal()
        self.updated = Signal()
        self.v_updated = Signal()
        self.tlb_hit = Signal()
        self.tlb_req_index = Signal(TLB_SET_BITS)

        self.tlb_hit_way = Signal(TLB_WAY_BITS)
        self.tlb_tag_way = Signal(TLB_TAG_WAY_BITS)
        self.tlb_pte_way = Signal(TLB_PTE_WAY_BITS)
        self.repl_way = Signal(TLB_WAY_BITS)
        self.eatag = Signal(TLB_EA_TAG_BITS)
        self.pte_data = Signal(TLB_PTE_BITS)

        self.dv = Signal(TLB_NUM_WAYS) # tlb_way_valids_t

        self.tb_out = Signal(TLB_TAG_WAY_BITS) # tlb_way_tags_t
        self.db_out = Signal(TLB_NUM_WAYS)     # tlb_way_valids_t
        self.pb_out = Signal(TLB_PTE_WAY_BITS) # tlb_way_ptes_t

    def elaborate(self, platform):
        m = Module()
        comb = m.d.comb
        sync = m.d.sync

        tagset = Signal(TLB_TAG_WAY_BITS)
        pteset = Signal(TLB_PTE_WAY_BITS)

        tb_out, pb_out, db_out = self.tb_out, self.pb_out, self.db_out
        comb += db_out.eq(self.dv)

        with m.If(self.tlbie & self.doall):
            pass # clear all back in parent
        with m.Elif(self.tlbie):
            with m.If(self.tlb_hit):
                comb += db_out.bit_select(self.tlb_hit_way, 1).eq(0)
                comb += self.v_updated.eq(1)

        with m.Elif(self.tlbwe):

            comb += tagset.eq(self.tlb_tag_way)
            comb += write_tlb_tag(self.repl_way, tagset, self.eatag)
            comb += tb_out.eq(tagset)

            comb += pteset.eq(self.tlb_pte_way)
            comb += write_tlb_pte(self.repl_way, pteset, self.pte_data)
            comb += pb_out.eq(pteset)

            comb += db_out.bit_select(self.repl_way, 1).eq(1)

            comb += self.updated.eq(1)
            comb += self.v_updated.eq(1)

        return m


class DCachePendingHit(Elaboratable):

    def __init__(self, tlb_pte_way, tlb_valid_way, tlb_hit_way,
                 cache_i_validdx, cache_tag_set,
                 req_addr,
                 hit_set):

        self.go = Signal()
        self.virt_mode = Signal()
        self.is_hit = Signal()
        self.tlb_hit = Signal()
        self.hit_way = Signal(WAY_BITS)
        self.rel_match = Signal()
        self.req_index = Signal(INDEX_BITS)
        self.reload_tag = Signal(TAG_BITS)

        self.tlb_hit_way = tlb_hit_way
        self.tlb_pte_way = tlb_pte_way
        self.tlb_valid_way = tlb_valid_way
        self.cache_i_validdx = cache_i_validdx
        self.cache_tag_set = cache_tag_set
        self.req_addr = req_addr
        self.hit_set = hit_set

    def elaborate(self, platform):
        m = Module()
        comb = m.d.comb
        sync = m.d.sync

        go = self.go
        virt_mode = self.virt_mode
        is_hit = self.is_hit
        tlb_pte_way = self.tlb_pte_way
        tlb_valid_way = self.tlb_valid_way
        cache_i_validdx = self.cache_i_validdx
        cache_tag_set = self.cache_tag_set
        req_addr = self.req_addr
        tlb_hit_way = self.tlb_hit_way
        tlb_hit = self.tlb_hit
        hit_set = self.hit_set
        hit_way = self.hit_way
        rel_match = self.rel_match
        req_index = self.req_index
        reload_tag = self.reload_tag

        rel_matches = Array(Signal(name="rel_matches_%d" % i) \
                            for i in range(TLB_NUM_WAYS))
        hit_way_set = HitWaySet()

        # Test if pending request is a hit on any way
        # In order to make timing in virtual mode,
        # when we are using the TLB, we compare each
        # way with each of the real addresses from each way of
        # the TLB, and then decide later which match to use.

        with m.If(virt_mode):
            for j in range(TLB_NUM_WAYS): # tlb_num_way_t
                s_tag = Signal(TAG_BITS, name="s_tag%d" % j)
                s_hit = Signal()
                s_pte = Signal(TLB_PTE_BITS)
                s_ra = Signal(REAL_ADDR_BITS)
                comb += s_pte.eq(read_tlb_pte(j, tlb_pte_way))
                comb += s_ra.eq(Cat(req_addr[0:TLB_LG_PGSZ],
                                    s_pte[TLB_LG_PGSZ:REAL_ADDR_BITS]))
                comb += s_tag.eq(get_tag(s_ra))

                for i in range(NUM_WAYS): # way_t
                    is_tag_hit = Signal(name="is_tag_hit_%d_%d" % (j, i))
                    comb += is_tag_hit.eq(go & cache_i_validdx[i] &
                                          (read_tag(i, cache_tag_set) == s_tag)
                                          & tlb_valid_way[j])
                    with m.If(is_tag_hit):
                        comb += hit_way_set[j].eq(i)
                        comb += s_hit.eq(1)
                comb += hit_set[j].eq(s_hit)
                with m.If(s_tag == reload_tag):
                    comb += rel_matches[j].eq(1)
            with m.If(tlb_hit):
                comb += is_hit.eq(hit_set[tlb_hit_way])
                comb += hit_way.eq(hit_way_set[tlb_hit_way])
                comb += rel_match.eq(rel_matches[tlb_hit_way])
        with m.Else():
            s_tag = Signal(TAG_BITS)
            comb += s_tag.eq(get_tag(req_addr))
            for i in range(NUM_WAYS): # way_t
                is_tag_hit = Signal(name="is_tag_hit_%d" % i)
                comb += is_tag_hit.eq(go & cache_i_validdx[i] &
                                      (read_tag(i, cache_tag_set) == s_tag))
                with m.If(is_tag_hit):
                    comb += hit_way.eq(i)
                    comb += is_hit.eq(1)
            with m.If(s_tag == reload_tag):
                comb += rel_match.eq(1)

        return m


class DCache(Elaboratable):
    """Set associative dcache write-through

    TODO (in no specific order):
    * See list in icache.vhdl
    * Complete load misses on the cycle when WB data comes instead of
      at the end of line (this requires dealing with requests coming in
      while not idle...)
    """
    def __init__(self):
        self.d_in = LoadStore1ToDCacheType("d_in")
        self.d_out = DCacheToLoadStore1Type("d_out")

        self.m_in = MMUToDCacheType("m_in")
        self.m_out = DCacheToMMUType("m_out")

        self.stall_out = Signal()

        # standard naming (wired to non-standard for compatibility)
        self.bus = Interface(addr_width=32,
                             data_width=64,
                             granularity=8,
                             features={'stall'},
                             alignment=0,
                             name="dcache")

        self.log_out = Signal(20)

    def stage_0(self, m, r0, r1, r0_full):
        """Latch the request in r0.req as long as we're not stalling
        """
        comb = m.d.comb
        sync = m.d.sync
        d_in, d_out, m_in = self.d_in, self.d_out, self.m_in

        r = RegStage0("stage0")

        # TODO, this goes in unit tests and formal proofs
        with m.If(d_in.valid & m_in.valid):
            sync += Display("request collision loadstore vs MMU")

        with m.If(m_in.valid):
            comb += r.req.valid.eq(1)
            comb += r.req.load.eq(~(m_in.tlbie | m_in.tlbld)) # no invalidate
            comb += r.req.dcbz.eq(0)
            comb += r.req.nc.eq(0)
            comb += r.req.reserve.eq(0)
            comb += r.req.virt_mode.eq(0)
            comb += r.req.priv_mode.eq(1)
            comb += r.req.addr.eq(m_in.addr)
            comb += r.req.data.eq(m_in.pte)
            comb += r.req.byte_sel.eq(~0) # Const -1 sets all to 0b111....
            comb += r.tlbie.eq(m_in.tlbie)
            comb += r.doall.eq(m_in.doall)
            comb += r.tlbld.eq(m_in.tlbld)
            comb += r.mmu_req.eq(1)
            m.d.sync += Display("    DCACHE req mmu addr %x pte %x ld %d",
                                m_in.addr, m_in.pte, r.req.load)

        with m.Else():
            comb += r.req.eq(d_in)
            comb += r.req.data.eq(0)
            comb += r.tlbie.eq(0)
            comb += r.doall.eq(0)
            comb += r.tlbld.eq(0)
            comb += r.mmu_req.eq(0)
        with m.If((~r1.full & ~d_in.hold) | ~r0_full):
            sync += r0.eq(r)
            sync += r0_full.eq(r.req.valid)
        # Sample data the cycle after a request comes in from loadstore1.
        # If another request has come in already then the data will get
        # put directly into req.data below.
        with m.If(r0.req.valid & ~r.req.valid & ~r0.d_valid &
                  ~r0.mmu_req):
            sync += r0.req.data.eq(d_in.data)
            sync += r0.d_valid.eq(1)
        with m.If(d_in.valid):
            m.d.sync += Display("    DCACHE req cache "
                                "virt %d addr %x data %x ld %d",
                                r.req.virt_mode, r.req.addr,
                                r.req.data, r.req.load)

    def tlb_read(self, m, r0_stall, tlb_valid_way,
                 tlb_tag_way, tlb_pte_way, dtlb):
        """TLB
        Operates in the second cycle on the request latched in r0.req.
        TLB updates write the entry at the end of the second cycle.
        """
        comb = m.d.comb
        sync = m.d.sync
        m_in, d_in = self.m_in, self.d_in

        index = Signal(TLB_SET_BITS)
        addrbits = Signal(TLB_SET_BITS)

        amin = TLB_LG_PGSZ
        amax = TLB_LG_PGSZ + TLB_SET_BITS

        with m.If(m_in.valid):
            comb += addrbits.eq(m_in.addr[amin : amax])
        with m.Else():
            comb += addrbits.eq(d_in.addr[amin : amax])
        comb += index.eq(addrbits)

        # If we have any op and the previous op isn't finished,
        # then keep the same output for next cycle.
        with m.If(~r0_stall):
            sync += tlb_valid_way.eq(dtlb[index].valid)
            sync += tlb_tag_way.eq(dtlb[index].tag)
            sync += tlb_pte_way.eq(dtlb[index].pte)

    def maybe_tlb_plrus(self, m, r1, tlb_plru_victim):
        """Generate TLB PLRUs
        """
        comb = m.d.comb
        sync = m.d.sync

        if TLB_NUM_WAYS == 0:
            return
        for i in range(TLB_SET_SIZE):
            # TLB PLRU interface
            tlb_plru = PLRU(TLB_WAY_BITS)
            setattr(m.submodules, "maybe_plru_%d" % i, tlb_plru)
            tlb_plru_acc_en = Signal()

            comb += tlb_plru_acc_en.eq(r1.tlb_hit & (r1.tlb_hit_index == i))
            comb += tlb_plru.acc_en.eq(tlb_plru_acc_en)
            comb += tlb_plru.acc_i.eq(r1.tlb_hit_way)
            comb += tlb_plru_victim[i].eq(tlb_plru.lru_o)

    def tlb_search(self, m, tlb_req_index, r0, r0_valid,
                   tlb_valid_way, tlb_tag_way, tlb_hit_way,
                   tlb_pte_way, pte, tlb_hit, valid_ra, perm_attr, ra):

        comb = m.d.comb

        hitway = Signal(TLB_WAY_BITS)
        hit = Signal()
        eatag = Signal(TLB_EA_TAG_BITS)

        TLB_LG_END = TLB_LG_PGSZ + TLB_SET_BITS
        comb += tlb_req_index.eq(r0.req.addr[TLB_LG_PGSZ : TLB_LG_END])
        comb += eatag.eq(r0.req.addr[TLB_LG_END : 64 ])

        for i in range(TLB_NUM_WAYS):
            is_tag_hit = Signal(name="is_tag_hit%d" % i)
            tlb_tag = Signal(TLB_EA_TAG_BITS, name="tlb_tag%d" % i)
            comb += tlb_tag.eq(read_tlb_tag(i, tlb_tag_way))
            comb += is_tag_hit.eq(tlb_valid_way[i] & (tlb_tag == eatag))
            with m.If(is_tag_hit):
                comb += hitway.eq(i)
                comb += hit.eq(1)

        comb += tlb_hit.eq(hit & r0_valid)
        comb += tlb_hit_way.eq(hitway)

        with m.If(tlb_hit):
            comb += pte.eq(read_tlb_pte(hitway, tlb_pte_way))
        comb += valid_ra.eq(tlb_hit | ~r0.req.virt_mode)

        with m.If(r0.req.virt_mode):
            comb += ra.eq(Cat(Const(0, ROW_OFF_BITS),
                              r0.req.addr[ROW_OFF_BITS:TLB_LG_PGSZ],
                              pte[TLB_LG_PGSZ:REAL_ADDR_BITS]))
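            # the bit positions below are a reading of the radix PTE
            # layout (R and C status bits, an ATT bit used here as
            # cache-inhibited, and the EAA privilege/read/write bits);
            # verify against the Power ISA spec rather than this note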
            comb += perm_attr.reference.eq(pte[8])
            comb += perm_attr.changed.eq(pte[7])
            comb += perm_attr.nocache.eq(pte[5])
            comb += perm_attr.priv.eq(pte[3])
            comb += perm_attr.rd_perm.eq(pte[2])
            comb += perm_attr.wr_perm.eq(pte[1])
        with m.Else():
            comb += ra.eq(Cat(Const(0, ROW_OFF_BITS),
                              r0.req.addr[ROW_OFF_BITS:REAL_ADDR_BITS]))
            comb += perm_attr.reference.eq(1)
            comb += perm_attr.changed.eq(1)
            comb += perm_attr.nocache.eq(0)
            comb += perm_attr.priv.eq(1)
            comb += perm_attr.rd_perm.eq(1)
            comb += perm_attr.wr_perm.eq(1)

        with m.If(valid_ra):
            m.d.sync += Display("DCACHE virt mode %d hit %d ra %x pte %x",
                                r0.req.virt_mode, tlb_hit, ra, pte)
            m.d.sync += Display("       perm ref=%d", perm_attr.reference)
            m.d.sync += Display("       perm chg=%d", perm_attr.changed)
            m.d.sync += Display("       perm noc=%d", perm_attr.nocache)
            m.d.sync += Display("       perm prv=%d", perm_attr.priv)
            m.d.sync += Display("       perm rdp=%d", perm_attr.rd_perm)
            m.d.sync += Display("       perm wrp=%d", perm_attr.wr_perm)

    def tlb_update(self, m, r0_valid, r0, dtlb, tlb_req_index,
                   tlb_hit_way, tlb_hit, tlb_plru_victim, tlb_tag_way,
                   tlb_pte_way):

        comb = m.d.comb
        sync = m.d.sync

        tlbie = Signal()
        tlbwe = Signal()

        comb += tlbie.eq(r0_valid & r0.tlbie)
        comb += tlbwe.eq(r0_valid & r0.tlbld)

        m.submodules.tlb_update = d = DTLBUpdate()
        with m.If(tlbie & r0.doall):
            # clear all valid bits at once
            for i in range(TLB_SET_SIZE):
                sync += dtlb[i].valid.eq(0)
        with m.If(d.updated):
            sync += dtlb[tlb_req_index].tag.eq(d.tb_out)
            sync += dtlb[tlb_req_index].pte.eq(d.pb_out)
        with m.If(d.v_updated):
            sync += dtlb[tlb_req_index].valid.eq(d.db_out)

        comb += d.dv.eq(dtlb[tlb_req_index].valid)

        comb += d.tlbie.eq(tlbie)
        comb += d.tlbwe.eq(tlbwe)
        comb += d.doall.eq(r0.doall)
        comb += d.tlb_hit.eq(tlb_hit)
        comb += d.tlb_hit_way.eq(tlb_hit_way)
        comb += d.tlb_tag_way.eq(tlb_tag_way)
        comb += d.tlb_pte_way.eq(tlb_pte_way)
        comb += d.tlb_req_index.eq(tlb_req_index)

        with m.If(tlb_hit):
            comb += d.repl_way.eq(tlb_hit_way)
        with m.Else():
            comb += d.repl_way.eq(tlb_plru_victim[tlb_req_index])
        comb += d.eatag.eq(r0.req.addr[TLB_LG_PGSZ + TLB_SET_BITS:64])
        comb += d.pte_data.eq(r0.req.data)

    def maybe_plrus(self, m, r1, plru_victim):
        """Generate PLRUs
        """
        comb = m.d.comb
        sync = m.d.sync

        if NUM_WAYS == 0: # these are the cache-way PLRUs, not the TLB ones
            return

        for i in range(NUM_LINES):
            # PLRU interface
            plru = PLRU(WAY_BITS)
            setattr(m.submodules, "plru%d" % i, plru)
            plru_acc_en = Signal()

            comb += plru_acc_en.eq(r1.cache_hit & (r1.hit_index == i))
            comb += plru.acc_en.eq(plru_acc_en)
            comb += plru.acc_i.eq(r1.hit_way)
            comb += plru_victim[i].eq(plru.lru_o)

    def cache_tag_read(self, m, r0_stall, req_index, cache_tag_set, cache_tags):
        """Cache tag RAM read port
        """
        comb = m.d.comb
        sync = m.d.sync
        m_in, d_in = self.m_in, self.d_in

        index = Signal(INDEX_BITS)

        with m.If(r0_stall):
            comb += index.eq(req_index)
        with m.Elif(m_in.valid):
            comb += index.eq(get_index(m_in.addr))
        with m.Else():
            comb += index.eq(get_index(d_in.addr))
        sync += cache_tag_set.eq(cache_tags[index].tag)

    def dcache_request(self, m, r0, ra, req_index, req_row, req_tag,
                       r0_valid, r1, cache_tags, replace_way,
                       use_forward1_next, use_forward2_next,
                       req_hit_way, plru_victim, rc_ok, perm_attr,
                       valid_ra, perm_ok, access_ok, req_op, req_go,
                       tlb_pte_way,
                       tlb_hit, tlb_hit_way, tlb_valid_way, cache_tag_set,
                       cancel_store, req_same_tag, r0_stall, early_req_row):
        """Cache request parsing and hit detection
        """

        comb = m.d.comb
        m_in, d_in = self.m_in, self.d_in

        is_hit = Signal()
        hit_way = Signal(WAY_BITS)
        op = Signal(Op)
        opsel = Signal(3)
        go = Signal()
        nc = Signal()
        hit_set = Array(Signal(name="hit_set_%d" % i) \
                        for i in range(TLB_NUM_WAYS))
        cache_i_validdx = Signal(NUM_WAYS)

        # Extract line, row and tag from request
        comb += req_index.eq(get_index(r0.req.addr))
        comb += req_row.eq(get_row(r0.req.addr))
        comb += req_tag.eq(get_tag(ra))

        if False: # display on comb is a bit... busy.
            comb += Display("dcache_req addr:%x ra: %x idx: %x tag: %x row: %x",
                            r0.req.addr, ra, req_index, req_tag, req_row)

        comb += go.eq(r0_valid & ~(r0.tlbie | r0.tlbld) & ~r1.ls_error)
        comb += cache_i_validdx.eq(cache_tags[req_index].valid)

        m.submodules.dcache_pend = dc = DCachePendingHit(tlb_pte_way,
                                            tlb_valid_way, tlb_hit_way,
                                            cache_i_validdx, cache_tag_set,
                                            r0.req.addr,
                                            hit_set)

        comb += dc.tlb_hit.eq(tlb_hit)
        comb += dc.reload_tag.eq(r1.reload_tag)
        comb += dc.virt_mode.eq(r0.req.virt_mode)
        comb += dc.go.eq(go)
        comb += dc.req_index.eq(req_index)
        comb += is_hit.eq(dc.is_hit)
        comb += hit_way.eq(dc.hit_way)
        comb += req_same_tag.eq(dc.rel_match)

        # See if the request matches the line currently being reloaded
        with m.If((r1.state == State.RELOAD_WAIT_ACK) &
                  (req_index == r1.store_index) & req_same_tag):
            # For a store, consider this a hit even if the row isn't
            # valid since it will be by the time we perform the store.
            # For a load, check the appropriate row valid bit.
            rrow = Signal(ROW_LINE_BITS)
            comb += rrow.eq(req_row)
            valid = r1.rows_valid[rrow]
            comb += is_hit.eq((~r0.req.load) | valid)
            comb += hit_way.eq(replace_way)

        # Whether to use forwarded data for a load or not
        with m.If((get_row(r1.req.real_addr) == req_row) &
                  (r1.req.hit_way == hit_way)):
            # Only need to consider r1.write_bram here, since if we
            # are writing refill data here, then we don't have a
            # cache hit this cycle on the line being refilled.
            # (There is the possibility that the load following the
            # load miss that started the refill could be to the old
            # contents of the victim line, since it is a couple of
            # cycles after the refill starts before we see the updated
            # cache tag. In that case we don't use the bypass.)
            comb += use_forward1_next.eq(r1.write_bram)
        with m.If((r1.forward_row1 == req_row) & (r1.forward_way1 == hit_way)):
            comb += use_forward2_next.eq(r1.forward_valid1)

        # The way that matched on a hit
        comb += req_hit_way.eq(hit_way)

        # The way to replace on a miss
        with m.If(r1.write_tag):
            comb += replace_way.eq(plru_victim[r1.store_index])
        with m.Else():
            comb += replace_way.eq(r1.store_way)

        # work out whether we have permission for this access
        # NB we don't yet implement AMR, thus no KUAP
        comb += rc_ok.eq(perm_attr.reference
                         & (r0.req.load | perm_attr.changed))
        comb += perm_ok.eq((r0.req.priv_mode | (~perm_attr.priv)) &
                           (perm_attr.wr_perm |
                            (r0.req.load & perm_attr.rd_perm)))
        comb += access_ok.eq(valid_ra & perm_ok & rc_ok)
        # Combine the request and cache hit status to decide what
        # operation needs to be done
        comb += nc.eq(r0.req.nc | perm_attr.nocache)
        comb += op.eq(Op.OP_NONE)
        with m.If(go):
            with m.If(~access_ok):
                m.d.sync += Display("DCACHE access fail valid_ra=%d p=%d rc=%d",
                                    valid_ra, perm_ok, rc_ok)
                comb += op.eq(Op.OP_BAD)
            with m.Elif(cancel_store):
                m.d.sync += Display("DCACHE cancel store")
                comb += op.eq(Op.OP_STCX_FAIL)
            with m.Else():
                m.d.sync += Display("DCACHE valid_ra=%d nc=%d ld=%d",
                                    valid_ra, nc, r0.req.load)
                comb += opsel.eq(Cat(is_hit, nc, r0.req.load))
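                # opsel packs (LSB first) is_hit, nc, load; so e.g.
                # 0b101 means load=1, nc=0, hit=1: a cacheable load hit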
                with m.Switch(opsel):
                    with m.Case(0b101): comb += op.eq(Op.OP_LOAD_HIT)
                    with m.Case(0b100): comb += op.eq(Op.OP_LOAD_MISS)
                    with m.Case(0b110): comb += op.eq(Op.OP_LOAD_NC)
                    with m.Case(0b001): comb += op.eq(Op.OP_STORE_HIT)
                    with m.Case(0b000): comb += op.eq(Op.OP_STORE_MISS)
                    with m.Case(0b010): comb += op.eq(Op.OP_STORE_MISS)
                    with m.Case(0b011): comb += op.eq(Op.OP_BAD)
                    with m.Case(0b111): comb += op.eq(Op.OP_BAD)
        comb += req_op.eq(op)
        comb += req_go.eq(go)

        # Version of the row number that is valid one cycle earlier
        # in the cases where we need to read the cache data BRAM.
        # If we're stalling then we need to keep reading the last
        # row requested.
        with m.If(~r0_stall):
            with m.If(m_in.valid):
                comb += early_req_row.eq(get_row(m_in.addr))
            with m.Else():
                comb += early_req_row.eq(get_row(d_in.addr))
        with m.Else():
            comb += early_req_row.eq(req_row)

    def reservation_comb(self, m, cancel_store, set_rsrv, clear_rsrv,
                         r0_valid, r0, reservation):
        """Handle load-with-reservation and store-conditional instructions
        """
        comb = m.d.comb

        with m.If(r0_valid & r0.req.reserve):
            # XXX generate alignment interrupt if address
            # is not aligned XXX or if r0.req.nc = '1'
            with m.If(r0.req.load):
                comb += set_rsrv.eq(r0.req.atomic_last) # load with reservation
            with m.Else():
                comb += clear_rsrv.eq(r0.req.atomic_last) # store conditional
                with m.If((~reservation.valid) |
                          (r0.req.addr[LINE_OFF_BITS:64] != reservation.addr)):
                    comb += cancel_store.eq(1)

    def reservation_reg(self, m, r0_valid, access_ok, set_rsrv, clear_rsrv,
                        reservation, r0):

        comb = m.d.comb
        sync = m.d.sync

        with m.If(r0_valid & access_ok):
            with m.If(clear_rsrv):
                sync += reservation.valid.eq(0)
            with m.Elif(set_rsrv):
                sync += reservation.valid.eq(1)
                sync += reservation.addr.eq(r0.req.addr[LINE_OFF_BITS:64])

    def writeback_control(self, m, r1, cache_out_row):
        """Return data for loads & completion control logic
        """
        comb = m.d.comb
        sync = m.d.sync
        d_out, m_out = self.d_out, self.m_out

        data_out = Signal(64)
        data_fwd = Signal(64)

        # Use the bypass if we are reading the row that was
        # written 1 or 2 cycles ago, including for the
        # slow_valid = 1 case (i.e. completing a load
        # miss or a non-cacheable load).
        with m.If(r1.use_forward1):
            comb += data_fwd.eq(r1.forward_data1)
        with m.Else():
            comb += data_fwd.eq(r1.forward_data2)

        comb += data_out.eq(cache_out_row)

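        # Merge forwarded bytes over the BRAM read data one byte at a
        # time, as selected by forward_sel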
        for i in range(8):
            with m.If(r1.forward_sel[i]):
                dsel = data_fwd.word_select(i, 8)
                comb += data_out.word_select(i, 8).eq(dsel)

        comb += d_out.valid.eq(r1.ls_valid)
        comb += d_out.data.eq(data_out)
        comb += d_out.store_done.eq(~r1.stcx_fail)
        comb += d_out.error.eq(r1.ls_error)
        comb += d_out.cache_paradox.eq(r1.cache_paradox)

        # Outputs to MMU
        comb += m_out.done.eq(r1.mmu_done)
        comb += m_out.err.eq(r1.mmu_error)
        comb += m_out.data.eq(data_out)

        # We have a valid load or store hit or we just completed
        # a slow op such as a load miss, a NC load or a store
        #
        # Note: the load hit is delayed by one cycle. However it
        # can still not collide with r.slow_valid (well unless I
        # miscalculated) because slow_valid can only be set on a
        # subsequent request and not on its first cycle (the state
        # machine must have advanced), which makes slow_valid
        # at least 2 cycles from the previous hit_load_valid.

        # Sanity: Only one of these must be set in any given cycle

        if False: # TODO: need Display to get this to work
            assert (r1.slow_valid & r1.stcx_fail) != 1, \
                "unexpected slow_valid collision with stcx_fail"

            assert ((r1.slow_valid | r1.stcx_fail) | r1.hit_load_valid) != 1, \
                "unexpected hit_load_delayed collision with slow_valid"

        with m.If(~r1.mmu_req):
            # Request came from loadstore1...
            # Load hit case is the standard path
            with m.If(r1.hit_load_valid):
                sync += Display("completing load hit data=%x", data_out)

            # error cases complete without stalling
            with m.If(r1.ls_error):
                with m.If(r1.dcbz):
                    sync += Display("completing dcbz with error")
                with m.Else():
                    sync += Display("completing ld/st with error")

            # Slow ops (load miss, NC, stores)
            with m.If(r1.slow_valid):
                sync += Display("completing store or load miss adr=%x data=%x",
                                r1.req.real_addr, data_out)

        with m.Else():
            # Request came from MMU
            with m.If(r1.hit_load_valid):
                sync += Display("completing load hit to MMU, data=%x",
                                m_out.data)
            # error cases complete without stalling
            with m.If(r1.mmu_error):
                sync += Display("completing MMU ld with error")

            # Slow ops (i.e. load miss)
            with m.If(r1.slow_valid):
                sync += Display("completing MMU load miss, adr=%x data=%x",
                                r1.req.real_addr, m_out.data)

    def rams(self, m, r1, early_req_row, cache_out_row, replace_way):
        """rams
        Generate a cache RAM for each way. This handles the normal
        reads, writes from reloads and the special store-hit update
        path as well.

        Note: the BRAMs have an extra read buffer, meaning the output
        is pipelined an extra cycle. This differs from the
        icache. The writeback logic needs to take that into
        account by using 1-cycle delayed signals for load hits.
        """
        comb = m.d.comb
        bus = self.bus

        for i in range(NUM_WAYS):
            do_read = Signal(name="do_rd%d" % i)
            rd_addr = Signal(ROW_BITS, name="rd_addr_%d" % i)
            do_write = Signal(name="do_wr%d" % i)
            wr_addr = Signal(ROW_BITS, name="wr_addr_%d" % i)
            wr_data = Signal(WB_DATA_BITS, name="din_%d" % i)
            wr_sel = Signal(ROW_SIZE)
            wr_sel_m = Signal(ROW_SIZE)
            _d_out = Signal(WB_DATA_BITS, name="dout_%d" % i) # cache_row_t

            way = CacheRam(ROW_BITS, WB_DATA_BITS, ADD_BUF=True, ram_num=i)
            setattr(m.submodules, "cacheram_%d" % i, way)

            comb += way.rd_en.eq(do_read)
            comb += way.rd_addr.eq(rd_addr)
            comb += _d_out.eq(way.rd_data_o)
            comb += way.wr_sel.eq(wr_sel_m)
            comb += way.wr_addr.eq(wr_addr)
            comb += way.wr_data.eq(wr_data)

            # Cache hit reads
            comb += do_read.eq(1)
            comb += rd_addr.eq(early_req_row)
            with m.If(r1.hit_way == i):
                comb += cache_out_row.eq(_d_out)

            # Write mux:
            #
            # Defaults to wishbone read responses (cache refill)
            #
            # For timing, the mux on wr_data/sel/addr is not
            # dependent on anything other than the current state.

            with m.If(r1.write_bram):
                # Write store data to BRAM. This happens one
                # cycle after the store is in r0.
                comb += wr_data.eq(r1.req.data)
                comb += wr_sel.eq(r1.req.byte_sel)
                comb += wr_addr.eq(get_row(r1.req.real_addr))

                with m.If(i == r1.req.hit_way):
                    comb += do_write.eq(1)
            with m.Else():
                # Otherwise, we might be doing a reload or a DCBZ
                with m.If(r1.dcbz):
                    comb += wr_data.eq(0)
                with m.Else():
                    comb += wr_data.eq(bus.dat_r)
                comb += wr_addr.eq(r1.store_row)
                comb += wr_sel.eq(~0) # all 1s

                with m.If((r1.state == State.RELOAD_WAIT_ACK)
                          & bus.ack & (replace_way == i)):
                    comb += do_write.eq(1)

            # Mask write selects with do_write since BRAM
            # doesn't have a global write-enable
            with m.If(do_write):
                comb += wr_sel_m.eq(wr_sel)

    # Cache hit synchronous machine for the easy case.
    # This handles load hits.
    # It also handles error cases (TLB miss, cache paradox)
    def dcache_fast_hit(self, m, req_op, r0_valid, r0, r1,
                        req_hit_way, req_index, req_tag, access_ok,
                        tlb_hit, tlb_hit_way, tlb_req_index):

        comb = m.d.comb
        sync = m.d.sync

        with m.If(req_op != Op.OP_NONE):
            sync += Display("op:%d addr:%x nc: %d idx: %x tag: %x way: %x",
                            req_op, r0.req.addr, r0.req.nc,
                            req_index, req_tag, req_hit_way)

        with m.If(r0_valid):
            sync += r1.mmu_req.eq(r0.mmu_req)

        # Fast path for load/store hits.
        # Set signals for the writeback controls.
        sync += r1.hit_way.eq(req_hit_way)
        sync += r1.hit_index.eq(req_index)

        with m.If(req_op == Op.OP_LOAD_HIT):
            sync += r1.hit_load_valid.eq(1)
        with m.Else():
            sync += r1.hit_load_valid.eq(0)

        with m.If((req_op == Op.OP_LOAD_HIT) | (req_op == Op.OP_STORE_HIT)):
            sync += r1.cache_hit.eq(1)
        with m.Else():
            sync += r1.cache_hit.eq(0)

        with m.If(req_op == Op.OP_BAD):
            sync += Display("Signalling ld/st error "
                            "ls_error=%i mmu_error=%i cache_paradox=%i",
                            ~r0.mmu_req, r0.mmu_req, access_ok)
            sync += r1.ls_error.eq(~r0.mmu_req)
            sync += r1.mmu_error.eq(r0.mmu_req)
            sync += r1.cache_paradox.eq(access_ok)

        with m.Else():
            sync += r1.ls_error.eq(0)
            sync += r1.mmu_error.eq(0)
            sync += r1.cache_paradox.eq(0)

        with m.If(req_op == Op.OP_STCX_FAIL):
            sync += r1.stcx_fail.eq(1)
        with m.Else():
            sync += r1.stcx_fail.eq(0)

        # Record TLB hit information for updating TLB PLRU
        sync += r1.tlb_hit.eq(tlb_hit)
        sync += r1.tlb_hit_way.eq(tlb_hit_way)
        sync += r1.tlb_hit_index.eq(tlb_req_index)

    # Memory accesses are handled by this state machine:
    #
    #   * Cache load miss/reload (in conjunction with "rams")
    #   * Load hits for non-cachable forms
    #   * Stores (the collision case is handled in "rams")
    #
    # All wishbone requests generation is done here.
    # This machine operates at stage 1.
    def dcache_slow(self, m, r1, use_forward1_next, use_forward2_next,
                    r0, replace_way,
                    req_hit_way, req_same_tag,
                    r0_valid, req_op, cache_tags, req_go, ra):

        comb = m.d.comb
        sync = m.d.sync
        bus = self.bus
        d_in = self.d_in

        req = MemAccessRequest("mreq_ds")

        req_row = Signal(ROW_BITS)
        req_idx = Signal(INDEX_BITS)
        req_tag = Signal(TAG_BITS)
        comb += req_idx.eq(get_index(req.real_addr))
        comb += req_row.eq(get_row(req.real_addr))
        comb += req_tag.eq(get_tag(req.real_addr))

        sync += r1.use_forward1.eq(use_forward1_next)
        sync += r1.forward_sel.eq(0)

        with m.If(use_forward1_next):
            sync += r1.forward_sel.eq(r1.req.byte_sel)
        with m.Elif(use_forward2_next):
            sync += r1.forward_sel.eq(r1.forward_sel1)

        sync += r1.forward_data2.eq(r1.forward_data1)
        with m.If(r1.write_bram):
            sync += r1.forward_data1.eq(r1.req.data)
            sync += r1.forward_sel1.eq(r1.req.byte_sel)
            sync += r1.forward_way1.eq(r1.req.hit_way)
            sync += r1.forward_row1.eq(get_row(r1.req.real_addr))
            sync += r1.forward_valid1.eq(1)
        with m.Else():
            with m.If(r1.dcbz):
                sync += r1.forward_data1.eq(0)
            with m.Else():
                sync += r1.forward_data1.eq(bus.dat_r)
            sync += r1.forward_sel1.eq(~0) # all 1s
            sync += r1.forward_way1.eq(replace_way)
            sync += r1.forward_row1.eq(r1.store_row)
            sync += r1.forward_valid1.eq(0)

        # One cycle pulses reset
        sync += r1.slow_valid.eq(0)
        sync += r1.write_bram.eq(0)
        sync += r1.inc_acks.eq(0)
        sync += r1.dec_acks.eq(0)

        sync += r1.ls_valid.eq(0)
        # complete tlbies and TLB loads in the third cycle
        sync += r1.mmu_done.eq(r0_valid & (r0.tlbie | r0.tlbld))

        with m.If((req_op == Op.OP_LOAD_HIT) | (req_op == Op.OP_STCX_FAIL)):
            with m.If(~r0.mmu_req):
                sync += r1.ls_valid.eq(1)
            with m.Else():
                sync += r1.mmu_done.eq(1)

        with m.If(r1.write_tag):
            # Store new tag in selected way
            for i in range(NUM_WAYS):
                with m.If(i == replace_way):
                    ct = Signal(TAG_RAM_WIDTH)
                    comb += ct.eq(cache_tags[r1.store_index].tag)
                    """
                    TODO: check this
                    cache_tags(r1.store_index)((i + 1) * TAG_WIDTH - 1
                                               downto i * TAG_WIDTH) <=
                        (TAG_WIDTH - 1 downto TAG_BITS => '0') & r1.reload_tag;
                    """
                    comb += ct.word_select(i, TAG_WIDTH).eq(r1.reload_tag)
                    sync += cache_tags[r1.store_index].tag.eq(ct)
            sync += r1.store_way.eq(replace_way)
            sync += r1.write_tag.eq(0)

        # Take request from r1.req if there is one there,
        # else from req_op, ra, etc.
        with m.If(r1.full):
            comb += req.eq(r1.req)
        with m.Else():
            comb += req.op.eq(req_op)
            comb += req.valid.eq(req_go)
            comb += req.mmu_req.eq(r0.mmu_req)
            comb += req.dcbz.eq(r0.req.dcbz)
            comb += req.real_addr.eq(ra)

            with m.If(r0.req.dcbz):
                # force data to 0 for dcbz
                comb += req.data.eq(0)
            with m.Elif(r0.d_valid):
                comb += req.data.eq(r0.req.data)
            with m.Else():
                comb += req.data.eq(d_in.data)

            # Select all bytes for dcbz
            # and for cacheable loads
            with m.If(r0.req.dcbz | (r0.req.load & ~r0.req.nc)):
                comb += req.byte_sel.eq(~0) # all 1s
            with m.Else():
                comb += req.byte_sel.eq(r0.req.byte_sel)
            comb += req.hit_way.eq(req_hit_way)
            comb += req.same_tag.eq(req_same_tag)

        # Store the incoming request from r0,
        # if it is a slow request
        # Note that r1.full = 1 implies req_op = OP_NONE
        with m.If((req_op == Op.OP_LOAD_MISS)
                  | (req_op == Op.OP_LOAD_NC)
                  | (req_op == Op.OP_STORE_MISS)
                  | (req_op == Op.OP_STORE_HIT)):
            sync += r1.req.eq(req)
            sync += r1.full.eq(1)

        # Main state machine
        with m.Switch(r1.state):

            with m.Case(State.IDLE):
                sync += r1.wb.adr.eq(req.real_addr[ROW_LINE_BITS:])
                sync += r1.wb.sel.eq(req.byte_sel)
                sync += r1.wb.dat.eq(req.data)
                sync += r1.dcbz.eq(req.dcbz)

                # Keep track of our index and way
                # for subsequent stores.
                sync += r1.store_index.eq(req_idx)
                sync += r1.store_row.eq(req_row)
                sync += r1.end_row_ix.eq(get_row_of_line(req_row)-1)
                sync += r1.reload_tag.eq(req_tag)
                sync += r1.req.same_tag.eq(1)

                with m.If(req.op == Op.OP_STORE_HIT):
                    sync += r1.store_way.eq(req.hit_way)

                # Reset per-row valid bits,
                # ready for handling OP_LOAD_MISS
                for i in range(ROW_PER_LINE):
                    sync += r1.rows_valid[i].eq(0)

                with m.If(req_op != Op.OP_NONE):
                    sync += Display("cache op %d", req.op)

                with m.Switch(req.op):
                    with m.Case(Op.OP_LOAD_HIT):
                        # stay in IDLE state
                        pass

                    with m.Case(Op.OP_LOAD_MISS):
                        sync += Display("cache miss real addr: %x " \
                                        "idx: %x tag: %x",
                                        req.real_addr, req_row, req_tag)

                        # Start the wishbone cycle
                        sync += r1.wb.we.eq(0)
                        sync += r1.wb.cyc.eq(1)
                        sync += r1.wb.stb.eq(1)

                        # Track that we had one request sent
                        sync += r1.state.eq(State.RELOAD_WAIT_ACK)
                        sync += r1.write_tag.eq(1)

                    with m.Case(Op.OP_LOAD_NC):
                        sync += r1.wb.cyc.eq(1)
                        sync += r1.wb.stb.eq(1)
                        sync += r1.wb.we.eq(0)
                        sync += r1.state.eq(State.NC_LOAD_WAIT_ACK)

                    with m.Case(Op.OP_STORE_HIT, Op.OP_STORE_MISS):
                        with m.If(~req.dcbz):
                            sync += r1.state.eq(State.STORE_WAIT_ACK)
                            sync += r1.acks_pending.eq(1)
                            sync += r1.full.eq(0)
                            sync += r1.slow_valid.eq(1)

                            with m.If(~req.mmu_req):
                                sync += r1.ls_valid.eq(1)
                            with m.Else():
                                sync += r1.mmu_done.eq(1)

                            with m.If(req.op == Op.OP_STORE_HIT):
                                sync += r1.write_bram.eq(1)
                        with m.Else():
                            # dcbz is handled much like a load miss except
                            # that we are writing to memory instead of reading
                            sync += r1.state.eq(State.RELOAD_WAIT_ACK)

                            with m.If(req.op == Op.OP_STORE_MISS):
                                sync += r1.write_tag.eq(1)

                        sync += r1.wb.we.eq(1)
                        sync += r1.wb.cyc.eq(1)
                        sync += r1.wb.stb.eq(1)

                    # OP_NONE and OP_BAD do nothing
                    # OP_BAD & OP_STCX_FAIL were
                    # handled above already
                    with m.Case(Op.OP_NONE):
                        pass
                    with m.Case(Op.OP_BAD):
                        pass
                    with m.Case(Op.OP_STCX_FAIL):
                        pass

            with m.Case(State.RELOAD_WAIT_ACK):
                ld_stbs_done = Signal()
                # Requests are all sent if stb is 0
                comb += ld_stbs_done.eq(~r1.wb.stb)

                # If we are still sending requests, was one accepted?
                with m.If((~bus.stall) & r1.wb.stb):
                    # That was the last word? We are done sending.
                    # Clear stb and set ld_stbs_done so we can handle an
                    # eventual last ack on the same cycle.
                    # sigh - reconstruct wb adr with 3 extra 0s at front
                    wb_adr = Cat(Const(0, ROW_OFF_BITS), r1.wb.adr)
                    with m.If(is_last_row_addr(wb_adr, r1.end_row_ix)):
                        sync += r1.wb.stb.eq(0)
                        comb += ld_stbs_done.eq(1)

                    # Calculate the next row address in the current cache line
                    row = Signal(LINE_OFF_BITS-ROW_OFF_BITS)
                    comb += row.eq(r1.wb.adr)
                    sync += r1.wb.adr[:LINE_OFF_BITS-ROW_OFF_BITS].eq(row+1)

                # Incoming acks processing
                sync += r1.forward_valid1.eq(bus.ack)
                with m.If(bus.ack):
                    srow = Signal(ROW_LINE_BITS)
                    comb += srow.eq(r1.store_row)
                    sync += r1.rows_valid[srow].eq(1)

                    # If this is the data we were looking for,
                    # we can complete the request next cycle.
                    # Compare the whole address in case the
                    # request in r1.req is not the one that
                    # started this refill.
                    with m.If(req.valid & r1.req.same_tag &
                              ((r1.dcbz & r1.req.dcbz) |
                               (~r1.dcbz & (r1.req.op == Op.OP_LOAD_MISS))) &
                              (r1.store_row == get_row(req.real_addr))):
                        sync += r1.full.eq(0)
                        sync += r1.slow_valid.eq(1)
                        with m.If(~r1.mmu_req):
                            sync += r1.ls_valid.eq(1)
                        with m.Else():
                            sync += r1.mmu_done.eq(1)
                        sync += r1.forward_sel.eq(~0) # all 1s
                        sync += r1.use_forward1.eq(1)

                    # Check for completion
                    with m.If(ld_stbs_done & is_last_row(r1.store_row,
                                                         r1.end_row_ix)):
                        # Complete wishbone cycle
                        sync += r1.wb.cyc.eq(0)

                        # Cache line is now valid
                        cv = Signal(NUM_WAYS) # per-way valid bits
                        comb += cv.eq(cache_tags[r1.store_index].valid)
                        comb += cv.bit_select(r1.store_way, 1).eq(1)
                        sync += cache_tags[r1.store_index].valid.eq(cv)

                        sync += r1.state.eq(State.IDLE)
                        sync += Display("cache valid set %x "
                                        "idx %d way %d",
                                        cv, r1.store_index, r1.store_way)

                    # Increment store row counter
                    sync += r1.store_row.eq(next_row(r1.store_row))

            with m.Case(State.STORE_WAIT_ACK):
                st_stbs_done = Signal()
                acks = Signal(3)
                adjust_acks = Signal(3)

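                # Bookkeeping sketch: acks_pending counts stores sent but
                # not yet acked; inc_acks/dec_acks are the one-cycle
                # pulses set below, so the count rises as each new store
                # strobe goes out and falls as each ack returns, and the
                # wishbone cycle closes on the final ack with no strobes
                # left outstanding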
                comb += st_stbs_done.eq(~r1.wb.stb)
                comb += acks.eq(r1.acks_pending)

                with m.If(r1.inc_acks != r1.dec_acks):
                    with m.If(r1.inc_acks):
                        comb += adjust_acks.eq(acks + 1)
                    with m.Else():
                        comb += adjust_acks.eq(acks - 1)
                with m.Else():
                    comb += adjust_acks.eq(acks)

                sync += r1.acks_pending.eq(adjust_acks)

                # Clear stb when slave accepted request
                with m.If(~bus.stall):
                    # See if there is another store waiting
                    # to be done which is in the same real page.
                    with m.If(req.valid):
                        _ra = req.real_addr[ROW_LINE_BITS:SET_SIZE_BITS]
                        sync += r1.wb.adr[0:SET_SIZE_BITS].eq(_ra)
                        sync += r1.wb.dat.eq(req.data)
                        sync += r1.wb.sel.eq(req.byte_sel)

                    with m.If((adjust_acks < 7) & req.same_tag &
                              ((req.op == Op.OP_STORE_MISS)
                               | (req.op == Op.OP_STORE_HIT))):
                        sync += r1.wb.stb.eq(1)
                        comb += st_stbs_done.eq(0)

                        with m.If(req.op == Op.OP_STORE_HIT):
                            sync += r1.write_bram.eq(1)
                        sync += r1.full.eq(0)
                        sync += r1.slow_valid.eq(1)

                        # Store requests never come from the MMU
                        sync += r1.ls_valid.eq(1)
                        comb += st_stbs_done.eq(0)
                        sync += r1.inc_acks.eq(1)
                    with m.Else():
                        sync += r1.wb.stb.eq(0)
                        comb += st_stbs_done.eq(1)

                # Got ack ? See if complete.
                with m.If(bus.ack):
                    with m.If(st_stbs_done & (adjust_acks == 1)):
                        sync += r1.state.eq(State.IDLE)
                        sync += r1.wb.cyc.eq(0)
                        sync += r1.wb.stb.eq(0)
                    sync += r1.dec_acks.eq(1)

            with m.Case(State.NC_LOAD_WAIT_ACK):
                # Clear stb when slave accepted request
                with m.If(~bus.stall):
                    sync += r1.wb.stb.eq(0)

                # Got ack ? complete.
                with m.If(bus.ack):
                    sync += r1.state.eq(State.IDLE)
                    sync += r1.full.eq(0)
                    sync += r1.slow_valid.eq(1)

                    with m.If(~r1.mmu_req):
                        sync += r1.ls_valid.eq(1)
                    with m.Else():
                        sync += r1.mmu_done.eq(1)

                    sync += r1.forward_sel.eq(~0) # all 1s
                    sync += r1.use_forward1.eq(1)
                    sync += r1.wb.cyc.eq(0)
                    sync += r1.wb.stb.eq(0)

    def dcache_log(self, m, r1, valid_ra, tlb_hit_way, stall_out, req_op):
        # debug logging only: the call in elaborate below is commented out

        sync = m.d.sync
        d_out, bus, log_out = self.d_out, self.bus, self.log_out

        sync += log_out.eq(Cat(r1.state[:3], valid_ra, tlb_hit_way[:3],
                               stall_out, req_op[:3], d_out.valid, d_out.error,
                               r1.wb.cyc, r1.wb.stb, bus.ack, bus.stall,
                               r1.req.real_addr[3:6]))

    def elaborate(self, platform):

        m = Module()
        comb = m.d.comb
        d_in = self.d_in

        # Storage. Hopefully "cache_rows" is a BRAM, the rest is LUTs
        cache_tags = CacheTagArray()
        cache_tag_set = Signal(TAG_RAM_WIDTH)

        # TODO attribute ram_style : string;
        # TODO attribute ram_style of cache_tags : signal is "distributed";

        """note: these are passed to nmigen.hdl.Memory as "attributes".
           don't know how, just that they are.
        """
        dtlb = TLBArray()
        # TODO attribute ram_style of
        #  dtlb_tags : signal is "distributed";
        # TODO attribute ram_style of
        #  dtlb_ptes : signal is "distributed";

        r0 = RegStage0("r0")
        r0_full = Signal()

        r1 = RegStage1("r1")

        reservation = Reservation()

        # Async signals on incoming request
        req_index = Signal(INDEX_BITS)
        req_row = Signal(ROW_BITS)
        req_hit_way = Signal(WAY_BITS)
        req_tag = Signal(TAG_BITS)
        req_op = Signal(Op)
        req_data = Signal(64)
        req_same_tag = Signal()
        req_go = Signal()

        early_req_row = Signal(ROW_BITS)

        cancel_store = Signal()
        set_rsrv = Signal()
        clear_rsrv = Signal()

        r0_valid = Signal()
        r0_stall = Signal()

        use_forward1_next = Signal()
        use_forward2_next = Signal()

        cache_out_row = Signal(WB_DATA_BITS)

        plru_victim = PLRUOut()
        replace_way = Signal(WAY_BITS)

        # Wishbone read/write/cache write formatting signals
        bus_sel = Signal(8)

        # TLB signals
        tlb_tag_way = Signal(TLB_TAG_WAY_BITS)
        tlb_pte_way = Signal(TLB_PTE_WAY_BITS)
        tlb_valid_way = Signal(TLB_NUM_WAYS)
        tlb_req_index = Signal(TLB_SET_BITS)
        tlb_hit = Signal()
        tlb_hit_way = Signal(TLB_WAY_BITS)
        pte = Signal(TLB_PTE_BITS)
        ra = Signal(REAL_ADDR_BITS)
        valid_ra = Signal()
        perm_attr = PermAttr("dc_perms")
        rc_ok = Signal()
        perm_ok = Signal()
        access_ok = Signal()

        tlb_plru_victim = TLBPLRUOut()

        # we don't yet handle collisions between loadstore1 requests
        # and MMU requests
        comb += self.m_out.stall.eq(0)

        # Hold off the request in r0 when r1 has an uncompleted request
        comb += r0_stall.eq(r0_full & (r1.full | d_in.hold))
        comb += r0_valid.eq(r0_full & ~r1.full & ~d_in.hold)
        comb += self.stall_out.eq(r0_stall)


        # deal with litex not doing wishbone pipeline mode
        # XXX in wrong way. FIFOs are needed in the SRAM test
        # so that stb/ack match up. same thing done in icache.py
        comb += self.bus.stall.eq(self.bus.cyc & ~self.bus.ack)

        # Wire up wishbone request latch out of stage 1
        comb += self.bus.we.eq(r1.wb.we)
        comb += self.bus.adr.eq(r1.wb.adr)
        comb += self.bus.sel.eq(r1.wb.sel)
        comb += self.bus.stb.eq(r1.wb.stb)
        comb += self.bus.dat_w.eq(r1.wb.dat)
        comb += self.bus.cyc.eq(r1.wb.cyc)

        # call sub-functions putting everything together, using shared
        # signals established above
        self.stage_0(m, r0, r1, r0_full)
        self.tlb_read(m, r0_stall, tlb_valid_way,
                      tlb_tag_way, tlb_pte_way, dtlb)
        self.tlb_search(m, tlb_req_index, r0, r0_valid,
                        tlb_valid_way, tlb_tag_way, tlb_hit_way,
                        tlb_pte_way, pte, tlb_hit, valid_ra, perm_attr, ra)
        self.tlb_update(m, r0_valid, r0, dtlb, tlb_req_index,
                        tlb_hit_way, tlb_hit, tlb_plru_victim, tlb_tag_way,
                        tlb_pte_way)
        self.maybe_plrus(m, r1, plru_victim)
        self.maybe_tlb_plrus(m, r1, tlb_plru_victim)
        self.cache_tag_read(m, r0_stall, req_index, cache_tag_set, cache_tags)
        self.dcache_request(m, r0, ra, req_index, req_row, req_tag,
                            r0_valid, r1, cache_tags, replace_way,
                            use_forward1_next, use_forward2_next,
                            req_hit_way, plru_victim, rc_ok, perm_attr,
                            valid_ra, perm_ok, access_ok, req_op, req_go,
                            tlb_pte_way,
                            tlb_hit, tlb_hit_way, tlb_valid_way, cache_tag_set,
                            cancel_store, req_same_tag, r0_stall, early_req_row)
        self.reservation_comb(m, cancel_store, set_rsrv, clear_rsrv,
                              r0_valid, r0, reservation)
        self.reservation_reg(m, r0_valid, access_ok, set_rsrv, clear_rsrv,
                             reservation, r0)
        self.writeback_control(m, r1, cache_out_row)
        self.rams(m, r1, early_req_row, cache_out_row, replace_way)
        self.dcache_fast_hit(m, req_op, r0_valid, r0, r1,
                             req_hit_way, req_index, req_tag, access_ok,
                             tlb_hit, tlb_hit_way, tlb_req_index)
        self.dcache_slow(m, r1, use_forward1_next, use_forward2_next,
                         r0, replace_way,
                         req_hit_way, req_same_tag,
                         r0_valid, req_op, cache_tags, req_go, ra)
        #self.dcache_log(m, r1, valid_ra, tlb_hit_way, self.stall_out, req_op)

        return m


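# A minimal end-to-end simulation sketch (illustrative only: the helper
# name and stimulus here are invented, and the SRAM wiring follows the
# pattern used by the unit tests elsewhere in soc). It backs the dcache
# wishbone bus with a small SRAM, issues one real-mode privileged load
# and prints the returned data. Treat it as a sketch of how d_in/d_out
# and the bus are driven, not as a definitive test.
def dcache_load_sketch():
    dut = DCache()
    mem = Memory(width=64, depth=16, init=range(16))
    sram = SRAM(memory=mem, granularity=8)

    m = Module()
    m.submodules.dcache = dut
    m.submodules.sram = sram

    # connect the dcache wishbone master to the SRAM slave
    m.d.comb += sram.bus.cyc.eq(dut.bus.cyc)
    m.d.comb += sram.bus.stb.eq(dut.bus.stb)
    m.d.comb += sram.bus.we.eq(dut.bus.we)
    m.d.comb += sram.bus.sel.eq(dut.bus.sel)
    m.d.comb += sram.bus.adr.eq(dut.bus.adr)
    m.d.comb += sram.bus.dat_w.eq(dut.bus.dat_w)
    m.d.comb += dut.bus.ack.eq(sram.bus.ack)
    m.d.comb += dut.bus.dat_r.eq(sram.bus.dat_r)

    def process():
        # issue a single real-mode load of the doubleword at 0x40
        # (priv_mode must be set: real mode reports priv in perm_attr)
        yield dut.d_in.priv_mode.eq(1)
        yield dut.d_in.load.eq(1)
        yield dut.d_in.byte_sel.eq(0xff)
        yield dut.d_in.addr.eq(0x40)
        yield dut.d_in.valid.eq(1)
        yield
        yield dut.d_in.valid.eq(0)
        # wait for the (miss, reload, complete) sequence to finish
        while not (yield dut.d_out.valid):
            yield
        print("load data %x" % (yield dut.d_out.data))

    sim = Simulator(m)
    sim.add_clock(1e-6)
    sim.add_sync_process(wrap(process()))
    sim.run()

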
if __name__ == '__main__':
    dut = DCache()
    vl = rtlil.convert(dut, ports=[])
    with open("test_dcache.il", "w") as f:
        f.write(vl)