src/soc/experiment/dcache.py

   1 """DCache
   2
   3 based on Anton Blanchard microwatt dcache.vhdl
   4
   5 note that the microwatt dcache wishbone interface expects "stall".
   6 for simplicity at the moment this is hard-coded to cyc & ~ack.
   7 see WB4 spec, p84, section 5.2.1
   8
   9 IMPORTANT: for store, the data is sampled the cycle AFTER the "valid"
  10 is raised.  sigh
  11
  12 Links:
  13
  14 * https://libre-soc.org/3d_gpu/architecture/set_associative_cache.jpg
  15 * https://bugs.libre-soc.org/show_bug.cgi?id=469
  16
  17 """
  18
  19 import sys
  20
  21 from nmutil.gtkw import write_gtkw
  22
  23 sys.setrecursionlimit(1000000)
  24
  25 from enum import Enum, unique
  26
  27 from nmigen import (Module, Signal, Elaboratable, Cat, Repl, Array, Const,
  28                     Record)
  29 from nmutil.util import Display
  30 from nmigen.lib.coding import Decoder
  31
  32 from copy import deepcopy
  33 from random import randint, seed
  34
  35 from nmigen_soc.wishbone.bus import Interface
  36
  37 from nmigen.cli import main
  38 from nmutil.iocontrol import RecordObject
  39 from nmigen.utils import log2_int
  40 from soc.experiment.mem_types import (LoadStore1ToDCacheType,
  41                                      DCacheToLoadStore1Type,
  42                                      MMUToDCacheType,
  43                                      DCacheToMMUType)
  44
  45 from soc.experiment.wb_types import (WB_ADDR_BITS, WB_DATA_BITS, WB_SEL_BITS,
  46                                 WBAddrType, WBDataType, WBSelType,
  47                                 WBMasterOut, WBSlaveOut,
  48                                 WBMasterOutVector, WBSlaveOutVector,
  49                                 WBIOMasterOut, WBIOSlaveOut)
  50
  51 from soc.experiment.cache_ram import CacheRam
  52 #from soc.experiment.plru import PLRU
  53 from nmutil.plru import PLRU, PLRUs
  54
  55 # for test
  56 from soc.bus.sram import SRAM
  57 from nmigen import Memory
  58 from nmigen.cli import rtlil
  59
  60 # NOTE: to use cxxsim, export NMIGEN_SIM_MODE=cxxsim from the shell
  61 # Also, check out the cxxsim nmigen branch, and latest yosys from git
  62 from nmutil.sim_tmp_alternative import Simulator
  63
  64 from nmutil.util import wrap
  65
  66
# Cache and L1 DTLB geometry.
# TODO: make these parameters of DCache at some point
LINE_SIZE = 64    # Line size in bytes
NUM_LINES = 16    # Number of lines per way (one per cache index value)
NUM_WAYS = 4      # Number of ways
TLB_SET_SIZE = 64 # Number of L1 DTLB sets (i.e. entries per way)
TLB_NUM_WAYS = 2  # L1 DTLB number of ways
TLB_LG_PGSZ = 12  # L1 DTLB log_2(page_size)
LOG_LENGTH = 0    # Non-zero to enable log data collection
  75
  76 # BRAM organisation: We never access more than
  77 #     -- WB_DATA_BITS at a time so to save
  78 #     -- resources we make the array only that wide, and
  79 #     -- use consecutive indices to make a cache "line"
  80 #     --
  81 #     -- ROW_SIZE is the width in bytes of the BRAM
  82 #     -- (based on WB, so 64-bits)
  83 ROW_SIZE = WB_DATA_BITS // 8;
  84
  85 # ROW_PER_LINE is the number of row (wishbone
  86 # transactions) in a line
  87 ROW_PER_LINE = LINE_SIZE // ROW_SIZE
  88
  89 # BRAM_ROWS is the number of rows in BRAM needed
  90 # to represent the full dcache
  91 BRAM_ROWS = NUM_LINES * ROW_PER_LINE
  92
  93 print ("ROW_SIZE", ROW_SIZE)
  94 print ("ROW_PER_LINE", ROW_PER_LINE)
  95 print ("BRAM_ROWS", BRAM_ROWS)
  96 print ("NUM_WAYS", NUM_WAYS)
  97
  98 # Bit fields counts in the address
  99
 100 # REAL_ADDR_BITS is the number of real address
 101 # bits that we store
 102 REAL_ADDR_BITS = 56
 103
 104 # ROW_BITS is the number of bits to select a row
 105 ROW_BITS = log2_int(BRAM_ROWS)
 106
 107 # ROW_LINE_BITS is the number of bits to select
 108 # a row within a line
 109 ROW_LINE_BITS = log2_int(ROW_PER_LINE)
 110
 111 # LINE_OFF_BITS is the number of bits for
 112 # the offset in a cache line
 113 LINE_OFF_BITS = log2_int(LINE_SIZE)
 114
 115 # ROW_OFF_BITS is the number of bits for
 116 # the offset in a row
 117 ROW_OFF_BITS = log2_int(ROW_SIZE)
 118
 119 # INDEX_BITS is the number if bits to
 120 # select a cache line
 121 INDEX_BITS = log2_int(NUM_LINES)
 122
 123 # SET_SIZE_BITS is the log base 2 of the set size
 124 SET_SIZE_BITS = LINE_OFF_BITS + INDEX_BITS
 125
 126 # TAG_BITS is the number of bits of
 127 # the tag part of the address
 128 TAG_BITS = REAL_ADDR_BITS - SET_SIZE_BITS
 129
 130 # TAG_WIDTH is the width in bits of each way of the tag RAM
 131 TAG_WIDTH = TAG_BITS + 7 - ((TAG_BITS + 7) % 8)
 132
 133 # WAY_BITS is the number of bits to select a way
 134 WAY_BITS = log2_int(NUM_WAYS)
 135
 136 # Example of layout for 32 lines of 64 bytes:
 137 layout = """\
 138   ..  tag    |index|  line  |
 139   ..         |   row   |    |
 140   ..         |     |---|    | ROW_LINE_BITS  (3)
 141   ..         |     |--- - --| LINE_OFF_BITS (6)
 142   ..         |         |- --| ROW_OFF_BITS  (3)
 143   ..         |----- ---|    | ROW_BITS      (8)
 144   ..         |-----|        | INDEX_BITS    (5)
 145   .. --------|              | TAG_BITS      (45)
 146 """
 147 print (layout)
 148 print ("Dcache TAG %d IDX %d ROW_BITS %d ROFF %d LOFF %d RLB %d" % \
 149             (TAG_BITS, INDEX_BITS, ROW_BITS,
 150              ROW_OFF_BITS, LINE_OFF_BITS, ROW_LINE_BITS))
 151 print ("index @: %d-%d" % (LINE_OFF_BITS, SET_SIZE_BITS))
 152 print ("row @: %d-%d" % (LINE_OFF_BITS, ROW_OFF_BITS))
 153 print ("tag @: %d-%d width %d" % (SET_SIZE_BITS, REAL_ADDR_BITS, TAG_WIDTH))
 154
 155 TAG_RAM_WIDTH = TAG_WIDTH * NUM_WAYS
 156
 157 print ("TAG_RAM_WIDTH", TAG_RAM_WIDTH)
 158
 159 def CacheTagArray():
 160     tag_layout = [('valid', 1),
 161                   ('tag', TAG_RAM_WIDTH),
 162                  ]
 163     return Array(Record(tag_layout, name="tag%d" % x) for x in range(NUM_LINES))
 164
 165 def RowPerLineValidArray():
 166     return Array(Signal(name="rows_valid%d" % x) \
 167                         for x in range(ROW_PER_LINE))
 168
 169 # L1 TLB
 170 TLB_SET_BITS     = log2_int(TLB_SET_SIZE)
 171 TLB_WAY_BITS     = log2_int(TLB_NUM_WAYS)
 172 TLB_EA_TAG_BITS  = 64 - (TLB_LG_PGSZ + TLB_SET_BITS)
 173 TLB_TAG_WAY_BITS = TLB_NUM_WAYS * TLB_EA_TAG_BITS
 174 TLB_PTE_BITS     = 64
 175 TLB_PTE_WAY_BITS = TLB_NUM_WAYS * TLB_PTE_BITS;
 176
 177 def ispow2(x):
 178     return (1<<log2_int(x, False)) == x
 179
 180 assert (LINE_SIZE % ROW_SIZE) == 0, "LINE_SIZE not multiple of ROW_SIZE"
 181 assert ispow2(LINE_SIZE), "LINE_SIZE not power of 2"
 182 assert ispow2(NUM_LINES), "NUM_LINES not power of 2"
 183 assert ispow2(ROW_PER_LINE), "ROW_PER_LINE not power of 2"
 184 assert ROW_BITS == (INDEX_BITS + ROW_LINE_BITS), "geometry bits don't add up"
 185 assert (LINE_OFF_BITS == ROW_OFF_BITS + ROW_LINE_BITS), \
 186         "geometry bits don't add up"
 187 assert REAL_ADDR_BITS == (TAG_BITS + INDEX_BITS + LINE_OFF_BITS), \
 188         "geometry bits don't add up"
 189 assert REAL_ADDR_BITS == (TAG_BITS + ROW_BITS + ROW_OFF_BITS), \
 190          "geometry bits don't add up"
 191 assert 64 == WB_DATA_BITS, "Can't yet handle wb width that isn't 64-bits"
 192 assert SET_SIZE_BITS <= TLB_LG_PGSZ, "Set indexed by virtual address"
 193
 194 def TLBHit(name):
 195     return Record([('valid', 1),
 196                    ('way', TLB_WAY_BITS)], name=name)
 197
 198 def TLBTagEAArray():
 199     return Array(Signal(TLB_EA_TAG_BITS, name="tlbtagea%d" % x) \
 200                 for x in range (TLB_NUM_WAYS))
 201
 202 def TLBRecord(name):
 203     tlb_layout = [('valid', TLB_NUM_WAYS),
 204                   ('tag', TLB_TAG_WAY_BITS),
 205                   ('pte', TLB_PTE_WAY_BITS)
 206                  ]
 207     return Record(tlb_layout, name=name)
 208
 209 def TLBArray():
 210     return Array(TLBRecord(name="tlb%d" % x) for x in range(TLB_SET_SIZE))
 211
 212 def HitWaySet():
 213     return Array(Signal(WAY_BITS, name="hitway_%d" % x) \
 214                         for x in range(TLB_NUM_WAYS))
 215
 216 # Cache RAM interface
 217 def CacheRamOut():
 218     return Array(Signal(WB_DATA_BITS, name="cache_out%d" % x) \
 219                  for x in range(NUM_WAYS))
 220
 221 # PLRU output interface
 222 def PLRUOut():
 223     return Array(Signal(WAY_BITS, name="plru_out%d" % x) \
 224                 for x in range(NUM_LINES))
 225
 226 # TLB PLRU output interface
 227 def TLBPLRUOut():
 228     return Array(Signal(TLB_WAY_BITS, name="tlbplru_out%d" % x) \
 229                 for x in range(TLB_SET_SIZE))
 230
 231 # Helper functions to decode incoming requests
 232 #
 233 # Return the cache line index (tag index) for an address
 234 def get_index(addr):
 235     return addr[LINE_OFF_BITS:SET_SIZE_BITS]
 236
 237 # Return the cache row index (data memory) for an address
 238 def get_row(addr):
 239     return addr[ROW_OFF_BITS:SET_SIZE_BITS]
 240
 241 # Return the index of a row within a line
 242 def get_row_of_line(row):
 243     return row[:ROW_BITS][:ROW_LINE_BITS]
 244
 245 # Returns whether this is the last row of a line
 246 def is_last_row_addr(addr, last):
 247     return addr[ROW_OFF_BITS:LINE_OFF_BITS] == last
 248
 249 # Returns whether this is the last row of a line
 250 def is_last_row(row, last):
 251     return get_row_of_line(row) == last
 252
 253 # Return the next row in the current cache line. We use a
 254 # dedicated function in order to limit the size of the
 255 # generated adder to be only the bits within a cache line
 256 # (3 bits with default settings)
 257 def next_row(row):
 258     row_v = row[0:ROW_LINE_BITS] + 1
 259     return Cat(row_v[:ROW_LINE_BITS], row[ROW_LINE_BITS:])
 260
 261 # Get the tag value from the address
 262 def get_tag(addr):
 263     return addr[SET_SIZE_BITS:REAL_ADDR_BITS]
 264
 265 # Read a tag from a tag memory row
 266 def read_tag(way, tagset):
 267     return tagset.word_select(way, TAG_WIDTH)[:TAG_BITS]
 268
 269 # Read a TLB tag from a TLB tag memory row
 270 def read_tlb_tag(way, tags):
 271     return tags.word_select(way, TLB_EA_TAG_BITS)
 272
 273 # Write a TLB tag to a TLB tag memory row
 274 def write_tlb_tag(way, tags, tag):
 275     return read_tlb_tag(way, tags).eq(tag)
 276
 277 # Read a PTE from a TLB PTE memory row
 278 def read_tlb_pte(way, ptes):
 279     return ptes.word_select(way, TLB_PTE_BITS)
 280
 281 def write_tlb_pte(way, ptes, newpte):
 282     return read_tlb_pte(way, ptes).eq(newpte)
 283
 284
# Record for storing permission, attribute, etc. bits from a PTE
class PermAttr(RecordObject):
    """Permission / attribute bits belonging to a translation.

    Every field is a single-bit Signal.  The PTE bit numbers quoted on
    each field are the ones DCache.tlb_search copies in when the request
    is in virtual mode.
    """
    def __init__(self, name=None):
        super().__init__(name=name)
        self.reference = Signal() # from pte[8]
        self.changed   = Signal() # from pte[7]
        self.nocache   = Signal() # from pte[5]
        self.priv      = Signal() # from pte[3]
        self.rd_perm   = Signal() # from pte[2]
        self.wr_perm   = Signal() # from pte[1]
 295
 296
def extract_perm_attr(pte):
    """Return a freshly created PermAttr record.

    NOTE(review): `pte` is not used -- none of the returned record's
    fields are driven from it here, so this looks like a stub.  In the
    visible code the PTE bits are decoded inline in DCache.tlb_search.
    """
    pa = PermAttr()
    return pa;
 300
 301
# Type of operation on a "valid" input
# (used as the shape of MemAccessRequest.op; @unique guarantees that no
# two members share a value)
@unique
class Op(Enum):
    OP_NONE       = 0
    OP_BAD        = 1 # NC cache hit, TLB miss, prot/RC failure
    OP_STCX_FAIL  = 2 # conditional store w/o reservation
    OP_LOAD_HIT   = 3 # Cache hit on load
    OP_LOAD_MISS  = 4 # Load missing cache
    OP_LOAD_NC    = 5 # Non-cachable load
    OP_STORE_HIT  = 6 # Store hitting cache
    OP_STORE_MISS = 7 # Store missing cache
 313
 314
# Cache state machine
# (used as the shape of RegStage1.state; @unique guarantees that no two
# members share a value)
@unique
class State(Enum):
    IDLE             = 0 # Normal load hit processing
    RELOAD_WAIT_ACK  = 1 # Cache reload wait ack
    STORE_WAIT_ACK   = 2 # Store wait ack
    NC_LOAD_WAIT_ACK = 3 # Non-cachable load wait ack
 322
 323
 324 # Dcache operations:
 325 #
 326 # In order to make timing, we use the BRAMs with
 327 # an output buffer, which means that the BRAM
 328 # output is delayed by an extra cycle.
 329 #
 330 # Thus, the dcache has a 2-stage internal pipeline
 331 # for cache hits with no stalls.
 332 #
 333 # All other operations are handled via stalling
 334 # in the first stage.
 335 #
 336 # The second stage can thus complete a hit at the same
 337 # time as the first stage emits a stall for a complex op.
 338 #
 339 # Stage 0 register, basically contains just the latched request
 340
class RegStage0(RecordObject):
    """Stage 0 register: the latched request plus its MMU side-band flags.

    DCache.stage_0 fills a record of this type either from the MMU port
    (m_in) or from the loadstore1 port (d_in) and latches it into r0.
    """
    def __init__(self, name=None):
        super().__init__(name=name)
        # the request itself, in loadstore1 format (MMU requests are
        # converted into this format by DCache.stage_0)
        self.req     = LoadStore1ToDCacheType(name="lsmem")
        self.tlbie   = Signal() # indicates a tlbie request (from MMU)
        self.doall   = Signal() # with tlbie, indicates flush whole TLB
        self.tlbld   = Signal() # indicates a TLB load request (from MMU)
        self.mmu_req = Signal() # indicates source of request
        self.d_valid = Signal() # indicates req.data is valid now
 350
 351
class MemAccessRequest(RecordObject):
    """A memory access request as held in RegStage1.req.

    NOTE(review): the code that fills and consumes these fields is not
    in the visible part of the file; the per-field notes below state only
    what the declarations show, anything further is marked as a guess.
    """
    def __init__(self, name=None):
        super().__init__(name=name)
        self.op        = Signal(Op)             # one of the Op enum values
        self.valid     = Signal()
        self.dcbz      = Signal()
        self.real_addr = Signal(REAL_ADDR_BITS) # REAL_ADDR_BITS wide
        self.data      = Signal(64)
        self.byte_sel  = Signal(8)              # presumably one bit per
                                                # byte of data -- confirm
        self.hit_way   = Signal(WAY_BITS)       # a cache way number
        self.same_tag  = Signal()
        self.mmu_req   = Signal()
 364
 365
# First stage register, contains state for stage 1 of load hits
# and for the state machine used by all other operations
class RegStage1(RecordObject):
    """Stage 1 register.

    Groups, in declaration order: the pending request, cache hit state,
    TLB hit state, write-to-read forwarding buffers, the reload state
    machine, and the completion / error flags.
    """
    def __init__(self, name=None):
        super().__init__(name=name)
        # Info about the request
        self.full             = Signal() # have uncompleted request
        self.mmu_req          = Signal() # request is from MMU
        self.req              = MemAccessRequest(name="reqmem")

        # Cache hit state
        # (hit_way / cache_hit / hit_index feed the cache PLRUs in
        # DCache.maybe_plrus)
        self.hit_way          = Signal(WAY_BITS)
        self.hit_load_valid   = Signal()
        self.hit_index        = Signal(INDEX_BITS)
        self.cache_hit        = Signal()

        # TLB hit state
        # (feeds the TLB PLRUs in DCache.maybe_tlb_plrus)
        self.tlb_hit          = TLBHit("tlb_hit")
        self.tlb_hit_index    = Signal(TLB_SET_BITS)

        # 2-stage data buffer for data forwarded from writes to reads
        self.forward_data1    = Signal(64)
        self.forward_data2    = Signal(64)
        self.forward_sel1     = Signal(8)
        self.forward_valid1   = Signal()
        self.forward_way1     = Signal(WAY_BITS)
        self.forward_row1     = Signal(ROW_BITS)
        self.use_forward1     = Signal()
        self.forward_sel      = Signal(8)

        # Cache miss state (reload state machine)
        self.state            = Signal(State) # one of the State enum values
        self.dcbz             = Signal()
        self.write_bram       = Signal()
        self.write_tag        = Signal()
        self.slow_valid       = Signal()
        self.wb               = WBMasterOut("wb")
        self.reload_tag       = Signal(TAG_BITS)
        self.store_way        = Signal(WAY_BITS)
        self.store_row        = Signal(ROW_BITS)
        # store_index selects which line's PLRU victim is read out
        # (see DCache.maybe_plrus)
        self.store_index      = Signal(INDEX_BITS)
        self.end_row_ix       = Signal(ROW_LINE_BITS)
        self.rows_valid       = RowPerLineValidArray()
        self.acks_pending     = Signal(3)
        self.inc_acks         = Signal()
        self.dec_acks         = Signal()

        # Signals to complete (possibly with error)
        self.ls_valid         = Signal()
        self.ls_error         = Signal()
        self.mmu_done         = Signal()
        self.mmu_error        = Signal()
        self.cache_paradox    = Signal()

        # Signal to complete a failed stcx.
        self.stcx_fail        = Signal()
 422
 423
# Reservation information
class Reservation(RecordObject):
    """A reservation: a valid flag plus a line-granular address.

    NOTE(review): presumably this backs load-reserve / store-conditional
    (cf. Op.OP_STCX_FAIL); its users are not in the visible part of the
    file -- confirm.
    """
    def __init__(self):
        super().__init__()
        self.valid = Signal()
        # 64-bit address with the LINE_OFF_BITS offset bits left out
        self.addr  = Signal(64-LINE_OFF_BITS)
 430
 431
class DTLBUpdate(Elaboratable):
    """Holds the L1 DTLB array and implements its update and read ports.

    Update side (set selected by tlb_req_index):
    * tlbie & doall : every set's valid bits are cleared
    * tlbie only    : on a TLB hit, the valid bit of the hit way is cleared
    * tlbwe         : tag and PTE of way repl_way are replaced by eatag /
                      pte_data and that way's valid bit is set

    Read side: when tlb_read is set, the set selected by tlb_read_index is
    latched into tlb_way (available the following cycle).
    """
    def __init__(self):
        self.dtlb     = TLBArray()       # the TLB storage itself
        self.tlbie    = Signal()         # in: invalidate request
        self.tlbwe    = Signal()         # in: write-entry request
        self.doall    = Signal()         # in: with tlbie, clear everything
        self.updated  = Signal()         # tag/pte of addressed set change
        self.v_updated  = Signal()       # valid bits of addressed set change
        self.tlb_hit     = TLBHit("tlb_hit")       # in: lookup result
        self.tlb_req_index = Signal(TLB_SET_BITS)  # in: set being updated

        self.tlb_tag_way     = Signal(TLB_TAG_WAY_BITS) # in: current tags
        self.tlb_pte_way     = Signal(TLB_PTE_WAY_BITS) # in: current PTEs
        self.repl_way        = Signal(TLB_WAY_BITS)     # in: way to write
        self.eatag           = Signal(TLB_EA_TAG_BITS)  # in: new tag
        self.pte_data        = Signal(TLB_PTE_BITS)     # in: new PTE

        # current valid bits of the addressed set (driven in elaborate)
        self.dv = Signal(TLB_NUM_WAYS) # tlb_way_valids_t

        # next tag / valid / pte values for the addressed set
        self.tb_out = Signal(TLB_TAG_WAY_BITS) # tlb_way_tags_t
        self.db_out = Signal(TLB_NUM_WAYS)     # tlb_way_valids_t
        self.pb_out = Signal(TLB_PTE_WAY_BITS) # tlb_way_ptes_t

        # read from dtlb array
        self.tlb_read       = Signal()
        self.tlb_read_index = Signal(TLB_SET_BITS)
        self.tlb_way        = TLBRecord("o_tlb_way")

    def elaborate(self, platform):
        m = Module()
        comb = m.d.comb
        sync = m.d.sync

        # scratch copies of the tag / pte rows, used to patch a single way
        tagset   = Signal(TLB_TAG_WAY_BITS)
        pteset   = Signal(TLB_PTE_WAY_BITS)

        dtlb, tlb_req_index = self.dtlb, self.tlb_req_index
        tb_out, pb_out, db_out = self.tb_out, self.pb_out, self.db_out
        # default: valid bits unchanged; single bits are overridden below
        comb += db_out.eq(self.dv)

        with m.If(self.tlbie & self.doall):
            pass # clear-all is done in the sync block further down
        with m.Elif(self.tlbie):
            # invalidate just the way that hit (if any)
            with m.If(self.tlb_hit.valid):
                comb += db_out.bit_select(self.tlb_hit.way, 1).eq(0)
                comb += self.v_updated.eq(1)

        with m.Elif(self.tlbwe):

            # take the current tags, then overwrite only repl_way's tag
            # (the second assignment takes priority for those bits)
            comb += tagset.eq(self.tlb_tag_way)
            comb += write_tlb_tag(self.repl_way, tagset, self.eatag)
            comb += tb_out.eq(tagset)

            # same again for the PTEs
            comb += pteset.eq(self.tlb_pte_way)
            comb += write_tlb_pte(self.repl_way, pteset, self.pte_data)
            comb += pb_out.eq(pteset)

            # the written way becomes valid
            comb += db_out.bit_select(self.repl_way, 1).eq(1)

            comb += self.updated.eq(1)
            comb += self.v_updated.eq(1)

        with m.If(self.tlbie & self.doall):
            # clear all valid bits at once
            for i in range(TLB_SET_SIZE):
                sync += dtlb[i].valid.eq(0)
        with m.If(self.updated):
            sync += dtlb[tlb_req_index].tag.eq(self.tb_out)
            sync += dtlb[tlb_req_index].pte.eq(self.pb_out)
        with m.If(self.v_updated):
            sync += dtlb[tlb_req_index].valid.eq(self.db_out)

        # expose the current valid bits of the addressed set
        comb += self.dv.eq(dtlb[tlb_req_index].valid)

        # select one TLB way
        # (registered read: tlb_way keeps its old value while tlb_read is 0)
        with m.If(self.tlb_read):
            sync += self.tlb_way.eq(dtlb[self.tlb_read_index])

        return m
 511
 512
class DCachePendingHit(Elaboratable):
    """Works out whether the pending request hits in the cache.

    Inputs passed to the constructor are used as-is (not copied):
    * tlb_way         - TLB set read out for this request (valid/tag/pte)
    * cache_i_validdx - per-cache-way valid bits of the indexed line
    * cache_tag_set   - tags of all cache ways of the indexed line
    * req_addr        - request address
    * hit_set         - per-TLB-way "some cache way matched" outputs

    Outputs: is_hit, hit_way and rel_match (tag equals reload_tag).
    NOTE(review): req_index is declared but not used inside this class.
    """

    def __init__(self, tlb_way,
                      cache_i_validdx, cache_tag_set,
                    req_addr,
                    hit_set):

        self.go          = Signal()          # in: enables the comparison
        self.virt_mode   = Signal()          # in: translate through TLB
        self.is_hit      = Signal()          # out: request hits the cache
        self.tlb_hit      = TLBHit("tlb_hit") # in: TLB lookup result
        self.hit_way     = Signal(WAY_BITS)  # out: cache way that hit
        self.rel_match   = Signal()          # out: tag == reload_tag
        self.req_index   = Signal(INDEX_BITS)
        self.reload_tag  = Signal(TAG_BITS)  # in: tag to compare against

        self.tlb_way = tlb_way
        self.cache_i_validdx = cache_i_validdx
        self.cache_tag_set = cache_tag_set
        self.req_addr = req_addr
        self.hit_set = hit_set

    def elaborate(self, platform):
        m = Module()
        comb = m.d.comb
        sync = m.d.sync

        go = self.go
        virt_mode = self.virt_mode
        is_hit = self.is_hit
        tlb_way = self.tlb_way
        cache_i_validdx = self.cache_i_validdx
        cache_tag_set = self.cache_tag_set
        req_addr = self.req_addr
        tlb_hit = self.tlb_hit
        hit_set = self.hit_set
        hit_way = self.hit_way
        rel_match = self.rel_match
        req_index = self.req_index
        reload_tag = self.reload_tag

        # per-TLB-way intermediate results, selected by tlb_hit.way below
        rel_matches = Array(Signal(name="rel_matches_%d" % i) \
                                    for i in range(TLB_NUM_WAYS))
        hit_way_set = HitWaySet()

        # Test if pending request is a hit on any way
        # In order to make timing in virtual mode,
        # when we are using the TLB, we compare each
        # way with each of the real addresses from each way of
        # the TLB, and then decide later which match to use.

        with m.If(virt_mode):
            for j in range(TLB_NUM_WAYS): # tlb_num_way_t
                s_tag       = Signal(TAG_BITS, name="s_tag%d" % j)
                s_hit       = Signal()
                s_pte       = Signal(TLB_PTE_BITS)
                s_ra        = Signal(REAL_ADDR_BITS)
                # candidate real address for TLB way j: page offset from
                # the request, page number from that way's PTE
                comb += s_pte.eq(read_tlb_pte(j, tlb_way.pte))
                comb += s_ra.eq(Cat(req_addr[0:TLB_LG_PGSZ],
                                    s_pte[TLB_LG_PGSZ:REAL_ADDR_BITS]))
                comb += s_tag.eq(get_tag(s_ra))

                for i in range(NUM_WAYS): # way_t
                    is_tag_hit = Signal(name="is_tag_hit_%d_%d" % (j, i))
                    comb += is_tag_hit.eq(go & cache_i_validdx[i] &
                                  (read_tag(i, cache_tag_set) == s_tag)
                                  & (tlb_way.valid[j]))
                    # if several cache ways match, the highest-numbered
                    # one wins (later assignments take priority)
                    with m.If(is_tag_hit):
                        comb += hit_way_set[j].eq(i)
                        comb += s_hit.eq(1)
                comb += hit_set[j].eq(s_hit)
                with m.If(s_tag == reload_tag):
                    comb += rel_matches[j].eq(1)
            # now pick the results belonging to the TLB way that hit
            with m.If(tlb_hit.valid):
                comb += is_hit.eq(hit_set[tlb_hit.way])
                comb += hit_way.eq(hit_way_set[tlb_hit.way])
                comb += rel_match.eq(rel_matches[tlb_hit.way])
        with m.Else():
            # real mode: the request address is used directly
            s_tag       = Signal(TAG_BITS)
            comb += s_tag.eq(get_tag(req_addr))
            for i in range(NUM_WAYS): # way_t
                is_tag_hit = Signal(name="is_tag_hit_%d" % i)
                comb += is_tag_hit.eq(go & cache_i_validdx[i] &
                          (read_tag(i, cache_tag_set) == s_tag))
                with m.If(is_tag_hit):
                    comb += hit_way.eq(i)
                    comb += is_hit.eq(1)
            with m.If(s_tag == reload_tag):
                comb += rel_match.eq(1)

        return m
 604
 605
 606 class DCache(Elaboratable):
 607     """Set associative dcache write-through
 608
 609     TODO (in no specific order):
 610     * See list in icache.vhdl
 611     * Complete load misses on the cycle when WB data comes instead of
 612       at the end of line (this requires dealing with requests coming in
 613       while not idle...)
 614     """
    def __init__(self):
        # loadstore1 <-> dcache request / response ports
        self.d_in      = LoadStore1ToDCacheType("d_in")
        self.d_out     = DCacheToLoadStore1Type("d_out")

        # MMU <-> dcache request / response ports
        self.m_in      = MMUToDCacheType("m_in")
        self.m_out     = DCacheToMMUType("m_out")

        self.stall_out = Signal()

        # standard naming (wired to non-standard for compatibility)
        # wishbone master: 64-bit data at byte granularity, with the
        # optional 'stall' signal
        self.bus = Interface(addr_width=32,
                            data_width=64,
                            granularity=8,
                            features={'stall'},
                            alignment=0,
                            name="dcache")

        self.log_out   = Signal(20)

        # NOTE(review): tlb_read / tlb_update use self.dtlb_update, which
        # is not created here -- presumably it is set up elsewhere in the
        # class before those methods are called; confirm.
 633
    def stage_0(self, m, r0, r1, r0_full):
        """Latch the request in r0.req as long as we're not stalling

        A combinatorial RegStage0 record `r` is built from the MMU port
        when m_in.valid is set, otherwise from the loadstore1 port, and is
        copied into r0 whenever stage 1 can accept it or r0 is empty.

        m       -- Module to add the logic to
        r0      -- stage 0 register (written here)
        r1      -- stage 1 register (only r1.full is read here)
        r0_full -- flag: r0 holds a valid request (written here)
        """
        comb = m.d.comb
        sync = m.d.sync
        d_in, d_out, m_in = self.d_in, self.d_out, self.m_in

        r = RegStage0("stage0")

        # TODO, this goes in unit tests and formal proofs
        with m.If(d_in.valid & m_in.valid):
            sync += Display("request collision loadstore vs MMU")

        # MMU requests take priority and are converted into the
        # loadstore1 request format
        with m.If(m_in.valid):
            comb += r.req.valid.eq(1)
            comb += r.req.load.eq(~(m_in.tlbie | m_in.tlbld))# no invalidate
            comb += r.req.dcbz.eq(0)
            comb += r.req.nc.eq(0)
            comb += r.req.reserve.eq(0)
            comb += r.req.virt_mode.eq(0)
            comb += r.req.priv_mode.eq(1)
            comb += r.req.addr.eq(m_in.addr)
            comb += r.req.data.eq(m_in.pte)
            comb += r.req.byte_sel.eq(~0) # Const -1 sets all to 0b111....
            comb += r.tlbie.eq(m_in.tlbie)
            comb += r.doall.eq(m_in.doall)
            comb += r.tlbld.eq(m_in.tlbld)
            comb += r.mmu_req.eq(1)
            m.d.sync += Display("    DCACHE req mmu addr %x pte %x ld %d",
                                 m_in.addr, m_in.pte, r.req.load)

        with m.Else():
            comb += r.req.eq(d_in)
            # data is deliberately zeroed here (overriding the copy
            # above): it is sampled one cycle later, see below
            comb += r.req.data.eq(0)
            comb += r.tlbie.eq(0)
            comb += r.doall.eq(0)
            comb += r.tlbld.eq(0)
            comb += r.mmu_req.eq(0)
        # update r0 when stage 1 is free and loadstore1 is not holding,
        # or when r0 is currently empty
        with m.If((~r1.full & ~d_in.hold) | ~r0_full):
            sync += r0.eq(r)
            sync += r0_full.eq(r.req.valid)
            # Sample data the cycle after a request comes in from loadstore1.
            # If another request has come in already then the data will get
            # put directly into req.data below.
            with m.If(r0.req.valid & ~r.req.valid & ~r0.d_valid &
                     ~r0.mmu_req):
                sync += r0.req.data.eq(d_in.data)
                sync += r0.d_valid.eq(1)
        with m.If(d_in.valid):
            m.d.sync += Display("    DCACHE req cache "
                                "virt %d addr %x data %x ld %d",
                                 r.req.virt_mode, r.req.addr,
                                 r.req.data, r.req.load)
 687
    def tlb_read(self, m, r0_stall, tlb_way, dtlb):
        """TLB
        Operates in the second cycle on the request latched in r0.req.
        TLB updates write the entry at the end of the second cycle.

        m        -- Module to add the logic to
        r0_stall -- when set, the TLB read is suppressed
        tlb_way  -- driven with the TLB set read out by self.dtlb_update
        dtlb     -- not used in this method
        """
        comb = m.d.comb
        sync = m.d.sync
        m_in, d_in = self.m_in, self.d_in

        addrbits = Signal(TLB_SET_BITS)

        # bit range of the address that selects the TLB set
        amin = TLB_LG_PGSZ
        amax = TLB_LG_PGSZ + TLB_SET_BITS

        # MMU requests take priority over loadstore1 requests
        with m.If(m_in.valid):
            comb += addrbits.eq(m_in.addr[amin : amax])
        with m.Else():
            comb += addrbits.eq(d_in.addr[amin : amax])

        # If we have any op and the previous op isn't finished,
        # then keep the same output for next cycle.
        d = self.dtlb_update
        comb += d.tlb_read_index.eq(addrbits)
        comb += d.tlb_read.eq(~r0_stall)
        comb += tlb_way.eq(d.tlb_way)
 713
 714     def maybe_tlb_plrus(self, m, r1, tlb_plru_victim, tlb_req_index):
 715         """Generate TLB PLRUs
 716         """
 717         comb = m.d.comb
 718         sync = m.d.sync
 719
 720         if TLB_NUM_WAYS == 0:
 721             return
 722
 723         # Binary-to-Unary one-hot, enabled by tlb_hit valid
 724         tlb_plrus = PLRUs(TLB_SET_SIZE, TLB_WAY_BITS)
 725         m.submodules.tlb_plrus = tlb_plrus
 726         comb += tlb_plrus.way.eq(r1.tlb_hit.way)
 727         comb += tlb_plrus.valid.eq(r1.tlb_hit.valid)
 728         comb += tlb_plrus.index.eq(r1.tlb_hit_index)
 729         comb += tlb_plrus.isel.eq(tlb_req_index) # select victim
 730         comb += tlb_plru_victim.eq(tlb_plrus.o_index) # selected victim
 731
    def tlb_search(self, m, tlb_req_index, r0, r0_valid,
                   tlb_way,
                   pte, tlb_hit, valid_ra, perm_attr, ra):
        """Look up the address of the request in r0 in the TLB set tlb_way.

        Drives (combinatorially):
        * tlb_req_index -- TLB set index taken from r0.req.addr
        * tlb_hit       -- valid (qualified by r0_valid) and hit way
        * pte           -- PTE of the hit way (only on a hit)
        * valid_ra      -- TLB hit, or request not in virtual mode
        * ra            -- real address: from the PTE in virtual mode,
                           straight from r0.req.addr otherwise; the low
                           ROW_OFF_BITS bits are always zero
        * perm_attr     -- permission bits: from the PTE in virtual mode,
                           fixed values otherwise
        """

        comb = m.d.comb

        hitway = Signal(TLB_WAY_BITS)
        hit    = Signal()
        eatag  = Signal(TLB_EA_TAG_BITS)

        # split the effective address into set index and tag
        TLB_LG_END = TLB_LG_PGSZ + TLB_SET_BITS
        comb += tlb_req_index.eq(r0.req.addr[TLB_LG_PGSZ : TLB_LG_END])
        comb += eatag.eq(r0.req.addr[TLB_LG_END : 64 ])

        # compare the tag against every valid way of the set; if several
        # ways match, the highest-numbered one wins
        for i in range(TLB_NUM_WAYS):
            is_tag_hit = Signal(name="is_tag_hit%d" % i)
            tlb_tag = Signal(TLB_EA_TAG_BITS, name="tlb_tag%d" % i)
            comb += tlb_tag.eq(read_tlb_tag(i, tlb_way.tag))
            comb += is_tag_hit.eq((tlb_way.valid[i]) & (tlb_tag == eatag))
            with m.If(is_tag_hit):
                comb += hitway.eq(i)
                comb += hit.eq(1)

        comb += tlb_hit.valid.eq(hit & r0_valid)
        comb += tlb_hit.way.eq(hitway)

        with m.If(tlb_hit.valid):
            comb += pte.eq(read_tlb_pte(hitway, tlb_way.pte))
        comb += valid_ra.eq(tlb_hit.valid | ~r0.req.virt_mode)

        with m.If(r0.req.virt_mode):
            # page offset from the request, page number from the PTE
            comb += ra.eq(Cat(Const(0, ROW_OFF_BITS),
                              r0.req.addr[ROW_OFF_BITS:TLB_LG_PGSZ],
                              pte[TLB_LG_PGSZ:REAL_ADDR_BITS]))
            comb += perm_attr.reference.eq(pte[8])
            comb += perm_attr.changed.eq(pte[7])
            comb += perm_attr.nocache.eq(pte[5])
            comb += perm_attr.priv.eq(pte[3])
            comb += perm_attr.rd_perm.eq(pte[2])
            comb += perm_attr.wr_perm.eq(pte[1])
        with m.Else():
            # real mode: address passes through, everything is permitted
            comb += ra.eq(Cat(Const(0, ROW_OFF_BITS),
                              r0.req.addr[ROW_OFF_BITS:REAL_ADDR_BITS]))
            comb += perm_attr.reference.eq(1)
            comb += perm_attr.changed.eq(1)
            comb += perm_attr.nocache.eq(0)
            comb += perm_attr.priv.eq(1)
            comb += perm_attr.rd_perm.eq(1)
            comb += perm_attr.wr_perm.eq(1)

        # debug output
        with m.If(valid_ra):
            m.d.sync += Display("DCACHE virt mode %d hit %d ra %x pte %x",
                                r0.req.virt_mode, tlb_hit.valid, ra, pte)
            m.d.sync += Display("       perm ref=%d", perm_attr.reference)
            m.d.sync += Display("       perm chg=%d", perm_attr.changed)
            m.d.sync += Display("       perm noc=%d", perm_attr.nocache)
            m.d.sync += Display("       perm prv=%d", perm_attr.priv)
            m.d.sync += Display("       perm rdp=%d", perm_attr.rd_perm)
            m.d.sync += Display("       perm wrp=%d", perm_attr.wr_perm)
 791
    def tlb_update(self, m, r0_valid, r0, dtlb, tlb_req_index,
                    tlb_hit, tlb_plru_victim, tlb_way):
        """Drive the inputs of self.dtlb_update from the request in r0.

        tlbie / tlbwe are only raised when r0_valid is set.  The way to be
        written is the way that hit, or the PLRU victim when there is no
        hit.  The `dtlb` argument is not used in this method.
        """

        comb = m.d.comb
        sync = m.d.sync

        tlbie    = Signal()
        tlbwe    = Signal()

        comb += tlbie.eq(r0_valid & r0.tlbie)
        comb += tlbwe.eq(r0_valid & r0.tlbld)

        d = self.dtlb_update

        comb += d.tlbie.eq(tlbie)
        comb += d.tlbwe.eq(tlbwe)
        comb += d.doall.eq(r0.doall)
        comb += d.tlb_hit.eq(tlb_hit)
        comb += d.tlb_tag_way.eq(tlb_way.tag)
        comb += d.tlb_pte_way.eq(tlb_way.pte)
        comb += d.tlb_req_index.eq(tlb_req_index)

        # overwrite an existing entry for this address if there is one,
        # otherwise replace the PLRU victim
        with m.If(tlb_hit.valid):
            comb += d.repl_way.eq(tlb_hit.way)
        with m.Else():
            comb += d.repl_way.eq(tlb_plru_victim)
        # new tag: address bits above the page offset and set index
        comb += d.eatag.eq(r0.req.addr[TLB_LG_PGSZ + TLB_SET_BITS:64])
        comb += d.pte_data.eq(r0.req.data)
 820
 821     def maybe_plrus(self, m, r1, plru_victim):
 822         """Generate PLRUs
 823         """
 824         comb = m.d.comb
 825         sync = m.d.sync
 826
 827         if TLB_NUM_WAYS == 0:
 828             return
 829
 830         m.submodules.plrus = plrus = PLRUs(NUM_LINES, WAY_BITS)
 831         comb += plrus.way.eq(r1.hit_way)
 832         comb += plrus.valid.eq(r1.cache_hit)
 833         comb += plrus.index.eq(r1.hit_index)
 834         comb += plrus.isel.eq(r1.store_index) # select victim
 835         comb += plru_victim.eq(plrus.o_index) # selected victim
 836
 837     def cache_tag_read(self, m, r0_stall, req_index, cache_tag_set, cache_tags):
 838         """Cache tag RAM read port
 839         """
 840         comb = m.d.comb
 841         sync = m.d.sync
 842         m_in, d_in = self.m_in, self.d_in
 843
 844         index = Signal(INDEX_BITS)
 845
 846         with m.If(r0_stall):
 847             comb += index.eq(req_index)
 848         with m.Elif(m_in.valid):
 849             comb += index.eq(get_index(m_in.addr))
 850         with m.Else():
 851             comb += index.eq(get_index(d_in.addr))
 852         sync += cache_tag_set.eq(cache_tags[index].tag)
 853
 854     def dcache_request(self, m, r0, ra, req_index, req_row, req_tag,
 855                        r0_valid, r1, cache_tags, replace_way,
 856                        use_forward1_next, use_forward2_next,
 857                        req_hit_way, plru_victim, rc_ok, perm_attr,
 858                        valid_ra, perm_ok, access_ok, req_op, req_go,
 859                        tlb_hit, tlb_way, cache_tag_set,
 860                        cancel_store, req_same_tag, r0_stall, early_req_row):
 861         """Cache request parsing and hit detection
 862         """
 863
 864         comb = m.d.comb
 865         m_in, d_in = self.m_in, self.d_in
 866
 867         is_hit      = Signal()
 868         hit_way     = Signal(WAY_BITS)
 869         op          = Signal(Op)
 870         opsel       = Signal(3)
 871         go          = Signal()
 872         nc          = Signal()
 873         hit_set     = Array(Signal(name="hit_set_%d" % i) \
 874                                   for i in range(TLB_NUM_WAYS))
 875         cache_i_validdx = Signal(NUM_WAYS)
 876
 877         # Extract line, row and tag from request
 878         comb += req_index.eq(get_index(r0.req.addr))
 879         comb += req_row.eq(get_row(r0.req.addr))
 880         comb += req_tag.eq(get_tag(ra))
 881
 882         if False: # display on comb is a bit... busy.
 883             comb += Display("dcache_req addr:%x ra: %x idx: %x tag: %x row: %x",
 884                     r0.req.addr, ra, req_index, req_tag, req_row)
 885
 886         comb += go.eq(r0_valid & ~(r0.tlbie | r0.tlbld) & ~r1.ls_error)
 887         comb += cache_i_validdx.eq(cache_tags[req_index].valid)
 888
 889         m.submodules.dcache_pend = dc = DCachePendingHit(tlb_way,
 890                                             cache_i_validdx, cache_tag_set,
 891                                             r0.req.addr,
 892                                             hit_set)
 893         comb += dc.tlb_hit.eq(tlb_hit)
 894         comb += dc.reload_tag.eq(r1.reload_tag)
 895         comb += dc.virt_mode.eq(r0.req.virt_mode)
 896         comb += dc.go.eq(go)
 897         comb += dc.req_index.eq(req_index)
 898
 899         comb += is_hit.eq(dc.is_hit)
 900         comb += hit_way.eq(dc.hit_way)
 901         comb += req_same_tag.eq(dc.rel_match)
 902
 903         # See if the request matches the line currently being reloaded
 904         with m.If((r1.state == State.RELOAD_WAIT_ACK) &
 905                   (req_index == r1.store_index) & req_same_tag):
 906             # For a store, consider this a hit even if the row isn't
 907             # valid since it will be by the time we perform the store.
 908             # For a load, check the appropriate row valid bit.
 909             rrow = Signal(ROW_LINE_BITS)
 910             comb += rrow.eq(req_row)
 911             valid = r1.rows_valid[rrow]
 912             comb += is_hit.eq((~r0.req.load) | valid)
 913             comb += hit_way.eq(replace_way)
 914
 915         # Whether to use forwarded data for a load or not
 916         with m.If((get_row(r1.req.real_addr) == req_row) &
 917                   (r1.req.hit_way == hit_way)):
 918             # Only need to consider r1.write_bram here, since if we
 919             # are writing refill data here, then we don't have a
 920             # cache hit this cycle on the line being refilled.
 921             # (There is the possibility that the load following the
 922             # load miss that started the refill could be to the old
 923             # contents of the victim line, since it is a couple of
 924             # cycles after the refill starts before we see the updated
 925             # cache tag. In that case we don't use the bypass.)
 926             comb += use_forward1_next.eq(r1.write_bram)
 927         with m.If((r1.forward_row1 == req_row) & (r1.forward_way1 == hit_way)):
 928             comb += use_forward2_next.eq(r1.forward_valid1)
 929
 930         # The way that matched on a hit
 931         comb += req_hit_way.eq(hit_way)
 932
 933         # The way to replace on a miss
 934         with m.If(r1.write_tag):
 935             comb += replace_way.eq(plru_victim)
 936         with m.Else():
 937             comb += replace_way.eq(r1.store_way)
 938
 939         # work out whether we have permission for this access
 940         # NB we don't yet implement AMR, thus no KUAP
 941         comb += rc_ok.eq(perm_attr.reference
 942                          & (r0.req.load | perm_attr.changed))
 943         comb += perm_ok.eq((r0.req.priv_mode | (~perm_attr.priv)) &
 944                            (perm_attr.wr_perm |
 945                               (r0.req.load & perm_attr.rd_perm)))
 946         comb += access_ok.eq(valid_ra & perm_ok & rc_ok)
 947
 948         # Combine the request and cache hit status to decide what
 949         # operation needs to be done
 950         comb += nc.eq(r0.req.nc | perm_attr.nocache)
 951         comb += op.eq(Op.OP_NONE)
 952         with m.If(go):
 953             with m.If(~access_ok):
 954                 m.d.sync += Display("DCACHE access fail valid_ra=%d p=%d rc=%d",
 955                                  valid_ra, perm_ok, rc_ok)
 956                 comb += op.eq(Op.OP_BAD)
 957             with m.Elif(cancel_store):
 958                 m.d.sync += Display("DCACHE cancel store")
 959                 comb += op.eq(Op.OP_STCX_FAIL)
 960             with m.Else():
 961                 m.d.sync += Display("DCACHE valid_ra=%d nc=%d ld=%d",
 962                                  valid_ra, nc, r0.req.load)
 963                 comb += opsel.eq(Cat(is_hit, nc, r0.req.load))
 964                 with m.Switch(opsel):
 965                     with m.Case(0b101): comb += op.eq(Op.OP_LOAD_HIT)
 966                     with m.Case(0b100): comb += op.eq(Op.OP_LOAD_MISS)
 967                     with m.Case(0b110): comb += op.eq(Op.OP_LOAD_NC)
 968                     with m.Case(0b001): comb += op.eq(Op.OP_STORE_HIT)
 969                     with m.Case(0b000): comb += op.eq(Op.OP_STORE_MISS)
 970                     with m.Case(0b010): comb += op.eq(Op.OP_STORE_MISS)
 971                     with m.Case(0b011): comb += op.eq(Op.OP_BAD)
 972                     with m.Case(0b111): comb += op.eq(Op.OP_BAD)
 973         comb += req_op.eq(op)
 974         comb += req_go.eq(go)
 975
 976         # Version of the row number that is valid one cycle earlier
 977         # in the cases where we need to read the cache data BRAM.
 978         # If we're stalling then we need to keep reading the last
 979         # row requested.
 980         with m.If(~r0_stall):
 981             with m.If(m_in.valid):
 982                 comb += early_req_row.eq(get_row(m_in.addr))
 983             with m.Else():
 984                 comb += early_req_row.eq(get_row(d_in.addr))
 985         with m.Else():
 986             comb += early_req_row.eq(req_row)
 987
 988     def reservation_comb(self, m, cancel_store, set_rsrv, clear_rsrv,
 989                          r0_valid, r0, reservation):
 990         """Handle load-with-reservation and store-conditional instructions
 991         """
 992         comb = m.d.comb
 993
 994         with m.If(r0_valid & r0.req.reserve):
 995             # XXX generate alignment interrupt if address
 996             # is not aligned XXX or if r0.req.nc = '1'
 997             with m.If(r0.req.load):
 998                 comb += set_rsrv.eq(r0.req.atomic_last) # load with reservation
 999             with m.Else():
1000                 comb += clear_rsrv.eq(r0.req.atomic_last) # store conditional
1001                 with m.If((~reservation.valid) |
1002                          (r0.req.addr[LINE_OFF_BITS:64] != reservation.addr)):
1003                     comb += cancel_store.eq(1)
1004
1005     def reservation_reg(self, m, r0_valid, access_ok, set_rsrv, clear_rsrv,
1006                         reservation, r0):
1007         comb = m.d.comb
1008         sync = m.d.sync
1009
1010         with m.If(r0_valid & access_ok):
1011             with m.If(clear_rsrv):
1012                 sync += reservation.valid.eq(0)
1013             with m.Elif(set_rsrv):
1014                 sync += reservation.valid.eq(1)
1015                 sync += reservation.addr.eq(r0.req.addr[LINE_OFF_BITS:64])
1016
1017     def writeback_control(self, m, r1, cache_out_row):
1018         """Return data for loads & completion control logic
1019         """
1020         comb = m.d.comb
1021         sync = m.d.sync
1022         d_out, m_out = self.d_out, self.m_out
1023
1024         data_out = Signal(64)
1025         data_fwd = Signal(64)
1026
1027         # Use the bypass if are reading the row that was
1028         # written 1 or 2 cycles ago, including for the
1029         # slow_valid = 1 case (i.e. completing a load
1030         # miss or a non-cacheable load).
1031         with m.If(r1.use_forward1):
1032             comb += data_fwd.eq(r1.forward_data1)
1033         with m.Else():
1034             comb += data_fwd.eq(r1.forward_data2)
1035
1036         comb += data_out.eq(cache_out_row)
1037
1038         for i in range(8):
1039             with m.If(r1.forward_sel[i]):
1040                 dsel = data_fwd.word_select(i, 8)
1041                 comb += data_out.word_select(i, 8).eq(dsel)
1042
1043         # DCache output to LoadStore
1044         comb += d_out.valid.eq(r1.ls_valid)
1045         comb += d_out.data.eq(data_out)
1046         comb += d_out.store_done.eq(~r1.stcx_fail)
1047         comb += d_out.error.eq(r1.ls_error)
1048         comb += d_out.cache_paradox.eq(r1.cache_paradox)
1049
1050         # Outputs to MMU
1051         comb += m_out.done.eq(r1.mmu_done)
1052         comb += m_out.err.eq(r1.mmu_error)
1053         comb += m_out.data.eq(data_out)
1054
1055         # We have a valid load or store hit or we just completed
1056         # a slow op such as a load miss, a NC load or a store
1057         #
1058         # Note: the load hit is delayed by one cycle. However it
1059         # can still not collide with r.slow_valid (well unless I
1060         # miscalculated) because slow_valid can only be set on a
1061         # subsequent request and not on its first cycle (the state
1062         # machine must have advanced), which makes slow_valid
1063         # at least 2 cycles from the previous hit_load_valid.
1064
1065         # Sanity: Only one of these must be set in any given cycle
1066
1067         if False: # TODO: need Display to get this to work
1068             assert (r1.slow_valid & r1.stcx_fail) != 1, \
1069             "unexpected slow_valid collision with stcx_fail"
1070
1071             assert ((r1.slow_valid | r1.stcx_fail) | r1.hit_load_valid) != 1, \
1072              "unexpected hit_load_delayed collision with slow_valid"
1073
1074         with m.If(~r1.mmu_req):
1075             # Request came from loadstore1...
1076             # Load hit case is the standard path
1077             with m.If(r1.hit_load_valid):
1078                 sync += Display("completing load hit data=%x", data_out)
1079
1080             # error cases complete without stalling
1081             with m.If(r1.ls_error):
1082                 with m.If(r1.dcbz):
1083                     sync += Display("completing dcbz with error")
1084                 with m.Else():
1085                     sync += Display("completing ld/st with error")
1086
1087             # Slow ops (load miss, NC, stores)
1088             with m.If(r1.slow_valid):
1089                 sync += Display("completing store or load miss adr=%x data=%x",
1090                                 r1.req.real_addr, data_out)
1091
1092         with m.Else():
1093             # Request came from MMU
1094             with m.If(r1.hit_load_valid):
1095                 sync += Display("completing load hit to MMU, data=%x",
1096                                 m_out.data)
1097             # error cases complete without stalling
1098             with m.If(r1.mmu_error):
1099                 sync += Display("combpleting MMU ld with error")
1100
1101             # Slow ops (i.e. load miss)
1102             with m.If(r1.slow_valid):
1103                 sync += Display("completing MMU load miss, adr=%x data=%x",
1104                                 r1.req.real_addr, m_out.data)
1105
1106     def rams(self, m, r1, early_req_row, cache_out_row, replace_way):
1107         """rams
1108         Generate a cache RAM for each way. This handles the normal
1109         reads, writes from reloads and the special store-hit update
1110         path as well.
1111
1112         Note: the BRAMs have an extra read buffer, meaning the output
1113         is pipelined an extra cycle. This differs from the
1114         icache. The writeback logic needs to take that into
1115         account by using 1-cycle delayed signals for load hits.
1116         """
1117         comb = m.d.comb
1118         bus = self.bus
1119
1120         # a Binary-to-Unary one-hots here.  replace-way one-hot is gated
1121         # (enabled) by bus.ack, not-write-bram, and state RELOAD_WAIT_ACK
1122         m.submodules.rams_replace_way_e = rwe = Decoder(NUM_WAYS)
1123         comb += rwe.n.eq(~((r1.state == State.RELOAD_WAIT_ACK) & bus.ack &
1124                    ~r1.write_bram))
1125         comb += rwe.i.eq(replace_way)
1126
1127         m.submodules.rams_hit_way_e = hwe = Decoder(NUM_WAYS)
1128         comb += hwe.i.eq(r1.hit_way)
1129
1130         # this one is gated with write_bram, and replace_way_e can never be
1131         # set at the same time.  that means that do_write can OR the outputs
1132         m.submodules.rams_hit_req_way_e = hre = Decoder(NUM_WAYS)
1133         comb += hre.n.eq(~r1.write_bram) # Decoder.n is inverted
1134         comb += hre.i.eq(r1.req.hit_way)
1135
1136         # common Signals
1137         do_read  = Signal()
1138         wr_addr  = Signal(ROW_BITS)
1139         wr_data  = Signal(WB_DATA_BITS)
1140         wr_sel   = Signal(ROW_SIZE)
1141         rd_addr  = Signal(ROW_BITS)
1142
1143         comb += do_read.eq(1) # always enable
1144         comb += rd_addr.eq(early_req_row)
1145
1146         # Write mux:
1147         #
1148         # Defaults to wishbone read responses (cache refill)
1149         #
1150         # For timing, the mux on wr_data/sel/addr is not
1151         # dependent on anything other than the current state.
1152
1153         with m.If(r1.write_bram):
1154             # Write store data to BRAM.  This happens one
1155             # cycle after the store is in r0.
1156             comb += wr_data.eq(r1.req.data)
1157             comb += wr_sel.eq(r1.req.byte_sel)
1158             comb += wr_addr.eq(get_row(r1.req.real_addr))
1159
1160         with m.Else():
1161             # Otherwise, we might be doing a reload or a DCBZ
1162             with m.If(r1.dcbz):
1163                 comb += wr_data.eq(0)
1164             with m.Else():
1165                 comb += wr_data.eq(bus.dat_r)
1166             comb += wr_addr.eq(r1.store_row)
1167             comb += wr_sel.eq(~0) # all 1s
1168
1169         # set up Cache Rams
1170         for i in range(NUM_WAYS):
1171             do_write = Signal(name="do_wr%d" % i)
1172             wr_sel_m = Signal(ROW_SIZE, name="wr_sel_m_%d" % i)
1173             d_out   = Signal(WB_DATA_BITS, name="dout_%d" % i) # cache_row_t
1174
1175             way = CacheRam(ROW_BITS, WB_DATA_BITS, ADD_BUF=True, ram_num=i)
1176             setattr(m.submodules, "cacheram_%d" % i, way)
1177
1178             comb += way.rd_en.eq(do_read)
1179             comb += way.rd_addr.eq(rd_addr)
1180             comb += d_out.eq(way.rd_data_o)
1181             comb += way.wr_sel.eq(wr_sel_m)
1182             comb += way.wr_addr.eq(wr_addr)
1183             comb += way.wr_data.eq(wr_data)
1184
1185             # Cache hit reads
1186             with m.If(hwe.o[i]):
1187                 comb += cache_out_row.eq(d_out)
1188
1189             # these are mutually-exclusive via their Decoder-enablers
1190             # (note: Decoder-enable is inverted)
1191             comb += do_write.eq(hre.o[i] | rwe.o[i])
1192
1193             # Mask write selects with do_write since BRAM
1194             # doesn't have a global write-enable
1195             with m.If(do_write):
1196                 comb += wr_sel_m.eq(wr_sel)
1197
1198     # Cache hit synchronous machine for the easy case.
1199     # This handles load hits.
1200     # It also handles error cases (TLB miss, cache paradox)
1201     def dcache_fast_hit(self, m, req_op, r0_valid, r0, r1,
1202                         req_hit_way, req_index, req_tag, access_ok,
1203                         tlb_hit, tlb_req_index):
1204         comb = m.d.comb
1205         sync = m.d.sync
1206
1207         with m.If(req_op != Op.OP_NONE):
1208             sync += Display("op:%d addr:%x nc: %d idx: %x tag: %x way: %x",
1209                     req_op, r0.req.addr, r0.req.nc,
1210                     req_index, req_tag, req_hit_way)
1211
1212         with m.If(r0_valid):
1213             sync += r1.mmu_req.eq(r0.mmu_req)
1214
1215         # Fast path for load/store hits.
1216         # Set signals for the writeback controls.
1217         sync += r1.hit_way.eq(req_hit_way)
1218         sync += r1.hit_index.eq(req_index)
1219
1220         sync += r1.hit_load_valid.eq(req_op == Op.OP_LOAD_HIT)
1221         sync += r1.cache_hit.eq((req_op == Op.OP_LOAD_HIT) |
1222                                 (req_op == Op.OP_STORE_HIT))
1223
1224         with m.If(req_op == Op.OP_BAD):
1225             sync += Display("Signalling ld/st error "
1226                             "ls_error=%i mmu_error=%i cache_paradox=%i",
1227                             ~r0.mmu_req,r0.mmu_req,access_ok)
1228             sync += r1.ls_error.eq(~r0.mmu_req)
1229             sync += r1.mmu_error.eq(r0.mmu_req)
1230             sync += r1.cache_paradox.eq(access_ok)
1231         with m.Else():
1232             sync += r1.ls_error.eq(0)
1233             sync += r1.mmu_error.eq(0)
1234             sync += r1.cache_paradox.eq(0)
1235
1236         sync += r1.stcx_fail.eq(req_op == Op.OP_STCX_FAIL)
1237
1238         # Record TLB hit information for updating TLB PLRU
1239         sync += r1.tlb_hit.eq(tlb_hit)
1240         sync += r1.tlb_hit_index.eq(tlb_req_index)
1241
1242     # Memory accesses are handled by this state machine:
1243     #
1244     #   * Cache load miss/reload (in conjunction with "rams")
1245     #   * Load hits for non-cachable forms
1246     #   * Stores (the collision case is handled in "rams")
1247     #
1248     # All wishbone requests generation is done here.
1249     # This machine operates at stage 1.
1250     def dcache_slow(self, m, r1, use_forward1_next, use_forward2_next,
1251                     r0, replace_way,
1252                     req_hit_way, req_same_tag,
1253                     r0_valid, req_op, cache_tags, req_go, ra):
1254
1255         comb = m.d.comb
1256         sync = m.d.sync
1257         bus = self.bus
1258         d_in = self.d_in
1259
1260         req         = MemAccessRequest("mreq_ds")
1261
1262         req_row = Signal(ROW_BITS)
1263         req_idx = Signal(INDEX_BITS)
1264         req_tag = Signal(TAG_BITS)
1265         comb += req_idx.eq(get_index(req.real_addr))
1266         comb += req_row.eq(get_row(req.real_addr))
1267         comb += req_tag.eq(get_tag(req.real_addr))
1268
1269         sync += r1.use_forward1.eq(use_forward1_next)
1270         sync += r1.forward_sel.eq(0)
1271
1272         with m.If(use_forward1_next):
1273             sync += r1.forward_sel.eq(r1.req.byte_sel)
1274         with m.Elif(use_forward2_next):
1275             sync += r1.forward_sel.eq(r1.forward_sel1)
1276
1277         sync += r1.forward_data2.eq(r1.forward_data1)
1278         with m.If(r1.write_bram):
1279             sync += r1.forward_data1.eq(r1.req.data)
1280             sync += r1.forward_sel1.eq(r1.req.byte_sel)
1281             sync += r1.forward_way1.eq(r1.req.hit_way)
1282             sync += r1.forward_row1.eq(get_row(r1.req.real_addr))
1283             sync += r1.forward_valid1.eq(1)
1284         with m.Else():
1285             with m.If(r1.dcbz):
1286                 sync += r1.forward_data1.eq(0)
1287             with m.Else():
1288                 sync += r1.forward_data1.eq(bus.dat_r)
1289             sync += r1.forward_sel1.eq(~0) # all 1s
1290             sync += r1.forward_way1.eq(replace_way)
1291             sync += r1.forward_row1.eq(r1.store_row)
1292             sync += r1.forward_valid1.eq(0)
1293
1294         # One cycle pulses reset
1295         sync += r1.slow_valid.eq(0)
1296         sync += r1.write_bram.eq(0)
1297         sync += r1.inc_acks.eq(0)
1298         sync += r1.dec_acks.eq(0)
1299
1300         sync += r1.ls_valid.eq(0)
1301         # complete tlbies and TLB loads in the third cycle
1302         sync += r1.mmu_done.eq(r0_valid & (r0.tlbie | r0.tlbld))
1303
1304         with m.If((req_op == Op.OP_LOAD_HIT) | (req_op == Op.OP_STCX_FAIL)):
1305             with m.If(~r0.mmu_req):
1306                 sync += r1.ls_valid.eq(1)
1307             with m.Else():
1308                 sync += r1.mmu_done.eq(1)
1309
1310         with m.If(r1.write_tag):
1311             # Store new tag in selected way
1312             replace_way_onehot = Signal(NUM_WAYS)
1313             comb += replace_way_onehot.eq(1<<replace_way)
1314             for i in range(NUM_WAYS):
1315                 with m.If(replace_way_onehot[i]):
1316                     ct = Signal(TAG_RAM_WIDTH)
1317                     comb += ct.eq(cache_tags[r1.store_index].tag)
1318                     comb += ct.word_select(i, TAG_WIDTH).eq(r1.reload_tag)
1319                     sync += cache_tags[r1.store_index].tag.eq(ct)
1320             sync += r1.store_way.eq(replace_way)
1321             sync += r1.write_tag.eq(0)
1322
1323         # Take request from r1.req if there is one there,
1324         # else from req_op, ra, etc.
1325         with m.If(r1.full):
1326             comb += req.eq(r1.req)
1327         with m.Else():
1328             comb += req.op.eq(req_op)
1329             comb += req.valid.eq(req_go)
1330             comb += req.mmu_req.eq(r0.mmu_req)
1331             comb += req.dcbz.eq(r0.req.dcbz)
1332             comb += req.real_addr.eq(ra)
1333
1334             with m.If(r0.req.dcbz):
1335                 # force data to 0 for dcbz
1336                 comb += req.data.eq(0)
1337             with m.Elif(r0.d_valid):
1338                 comb += req.data.eq(r0.req.data)
1339             with m.Else():
1340                 comb += req.data.eq(d_in.data)
1341
1342             # Select all bytes for dcbz
1343             # and for cacheable loads
1344             with m.If(r0.req.dcbz | (r0.req.load & ~r0.req.nc)):
1345                 comb += req.byte_sel.eq(~0) # all 1s
1346             with m.Else():
1347                 comb += req.byte_sel.eq(r0.req.byte_sel)
1348             comb += req.hit_way.eq(req_hit_way)
1349             comb += req.same_tag.eq(req_same_tag)
1350
1351             # Store the incoming request from r0,
1352             # if it is a slow request
1353             # Note that r1.full = 1 implies req_op = OP_NONE
1354             with m.If((req_op == Op.OP_LOAD_MISS)
1355                       | (req_op == Op.OP_LOAD_NC)
1356                       | (req_op == Op.OP_STORE_MISS)
1357                       | (req_op == Op.OP_STORE_HIT)):
1358                 sync += r1.req.eq(req)
1359                 sync += r1.full.eq(1)
1360
1361         # Main state machine
1362         with m.Switch(r1.state):
1363
1364             with m.Case(State.IDLE):
1365                 sync += r1.wb.adr.eq(req.real_addr[ROW_LINE_BITS:])
1366                 sync += r1.wb.sel.eq(req.byte_sel)
1367                 sync += r1.wb.dat.eq(req.data)
1368                 sync += r1.dcbz.eq(req.dcbz)
1369
1370                 # Keep track of our index and way
1371                 # for subsequent stores.
1372                 sync += r1.store_index.eq(req_idx)
1373                 sync += r1.store_row.eq(req_row)
1374                 sync += r1.end_row_ix.eq(get_row_of_line(req_row)-1)
1375                 sync += r1.reload_tag.eq(req_tag)
1376                 sync += r1.req.same_tag.eq(1)
1377
1378                 with m.If(req.op == Op.OP_STORE_HIT):
1379                     sync += r1.store_way.eq(req.hit_way)
1380
1381                 # Reset per-row valid bits,
1382                 # ready for handling OP_LOAD_MISS
1383                 for i in range(ROW_PER_LINE):
1384                     sync += r1.rows_valid[i].eq(0)
1385
1386                 with m.If(req_op != Op.OP_NONE):
1387                     sync += Display("cache op %d", req.op)
1388
1389                 with m.Switch(req.op):
1390                     with m.Case(Op.OP_LOAD_HIT):
1391                         # stay in IDLE state
1392                         pass
1393
1394                     with m.Case(Op.OP_LOAD_MISS):
1395                         sync += Display("cache miss real addr: %x " \
1396                                 "idx: %x tag: %x",
1397                                 req.real_addr, req_row, req_tag)
1398
1399                         # Start the wishbone cycle
1400                         sync += r1.wb.we.eq(0)
1401                         sync += r1.wb.cyc.eq(1)
1402                         sync += r1.wb.stb.eq(1)
1403
1404                         # Track that we had one request sent
1405                         sync += r1.state.eq(State.RELOAD_WAIT_ACK)
1406                         sync += r1.write_tag.eq(1)
1407
1408                     with m.Case(Op.OP_LOAD_NC):
1409                         sync += r1.wb.cyc.eq(1)
1410                         sync += r1.wb.stb.eq(1)
1411                         sync += r1.wb.we.eq(0)
1412                         sync += r1.state.eq(State.NC_LOAD_WAIT_ACK)
1413
1414                     with m.Case(Op.OP_STORE_HIT, Op.OP_STORE_MISS):
1415                         with m.If(~req.dcbz):
1416                             sync += r1.state.eq(State.STORE_WAIT_ACK)
1417                             sync += r1.acks_pending.eq(1)
1418                             sync += r1.full.eq(0)
1419                             sync += r1.slow_valid.eq(1)
1420
1421                             with m.If(~req.mmu_req):
1422                                 sync += r1.ls_valid.eq(1)
1423                             with m.Else():
1424                                 sync += r1.mmu_done.eq(1)
1425
1426                             with m.If(req.op == Op.OP_STORE_HIT):
1427                                 sync += r1.write_bram.eq(1)
1428                         with m.Else():
1429                             # dcbz is handled much like a load miss except
1430                             # that we are writing to memory instead of reading
1431                             sync += r1.state.eq(State.RELOAD_WAIT_ACK)
1432
1433                             with m.If(req.op == Op.OP_STORE_MISS):
1434                                 sync += r1.write_tag.eq(1)
1435
1436                         sync += r1.wb.we.eq(1)
1437                         sync += r1.wb.cyc.eq(1)
1438                         sync += r1.wb.stb.eq(1)
1439
1440                     # OP_NONE and OP_BAD do nothing
1441                     # OP_BAD & OP_STCX_FAIL were
1442                     # handled above already
1443                     with m.Case(Op.OP_NONE):
1444                         pass
1445                     with m.Case(Op.OP_BAD):
1446                         pass
1447                     with m.Case(Op.OP_STCX_FAIL):
1448                         pass
1449
1450             with m.Case(State.RELOAD_WAIT_ACK):
1451                 ld_stbs_done = Signal()
1452                 # Requests are all sent if stb is 0
1453                 comb += ld_stbs_done.eq(~r1.wb.stb)
1454
1455                 # If we are still sending requests, was one accepted?
1456                 with m.If((~bus.stall) & r1.wb.stb):
1457                     # That was the last word?  We are done sending.
1458                     # Clear stb and set ld_stbs_done so we can handle an
1459                     # eventual last ack on the same cycle.
1460                     # sigh - reconstruct wb adr with 3 extra 0s at front
1461                     wb_adr = Cat(Const(0, ROW_OFF_BITS), r1.wb.adr)
1462                     with m.If(is_last_row_addr(wb_adr, r1.end_row_ix)):
1463                         sync += r1.wb.stb.eq(0)
1464                         comb += ld_stbs_done.eq(1)
1465
1466                     # Calculate the next row address in the current cache line
1467                     row = Signal(LINE_OFF_BITS-ROW_OFF_BITS)
1468                     comb += row.eq(r1.wb.adr)
1469                     sync += r1.wb.adr[:LINE_OFF_BITS-ROW_OFF_BITS].eq(row+1)
1470
1471                 # Incoming acks processing
1472                 sync += r1.forward_valid1.eq(bus.ack)
1473                 with m.If(bus.ack):
1474                     srow = Signal(ROW_LINE_BITS)
1475                     comb += srow.eq(r1.store_row)
1476                     sync += r1.rows_valid[srow].eq(1)
1477
1478                     # If this is the data we were looking for,
1479                     # we can complete the request next cycle.
1480                     # Compare the whole address in case the
1481                     # request in r1.req is not the one that
1482                     # started this refill.
1483                     with m.If(req.valid & r1.req.same_tag &
1484                               ((r1.dcbz & r1.req.dcbz) |
1485                                (~r1.dcbz & (r1.req.op == Op.OP_LOAD_MISS))) &
1486                                 (r1.store_row == get_row(req.real_addr))):
1487                         sync += r1.full.eq(0)
1488                         sync += r1.slow_valid.eq(1)
1489                         with m.If(~r1.mmu_req):
1490                             sync += r1.ls_valid.eq(1)
1491                         with m.Else():
1492                             sync += r1.mmu_done.eq(1)
1493                         sync += r1.forward_sel.eq(~0) # all 1s
1494                         sync += r1.use_forward1.eq(1)
1495
1496                     # Check for completion
1497                     with m.If(ld_stbs_done & is_last_row(r1.store_row,
1498                                                       r1.end_row_ix)):
1499                         # Complete wishbone cycle
1500                         sync += r1.wb.cyc.eq(0)
1501
1502                         # Cache line is now valid
1503                         cv = Signal(INDEX_BITS)
1504                         comb += cv.eq(cache_tags[r1.store_index].valid)
1505                         comb += cv.bit_select(r1.store_way, 1).eq(1)
1506                         sync += cache_tags[r1.store_index].valid.eq(cv)
1507
1508                         sync += r1.state.eq(State.IDLE)
1509                         sync += Display("cache valid set %x "
1510                                         "idx %d way %d",
1511                                          cv, r1.store_index, r1.store_way)
1512
1513                     # Increment store row counter
1514                     sync += r1.store_row.eq(next_row(r1.store_row))
1515
1516             with m.Case(State.STORE_WAIT_ACK):
1517                 st_stbs_done = Signal()
1518                 acks        = Signal(3)
1519                 adjust_acks = Signal(3)
1520
1521                 comb += st_stbs_done.eq(~r1.wb.stb)
1522                 comb += acks.eq(r1.acks_pending)
1523
1524                 with m.If(r1.inc_acks != r1.dec_acks):
1525                     with m.If(r1.inc_acks):
1526                         comb += adjust_acks.eq(acks + 1)
1527                     with m.Else():
1528                         comb += adjust_acks.eq(acks - 1)
1529                 with m.Else():
1530                     comb += adjust_acks.eq(acks)
1531
1532                 sync += r1.acks_pending.eq(adjust_acks)
1533
1534                 # Clear stb when slave accepted request
1535                 with m.If(~bus.stall):
1536                     # See if there is another store waiting
1537                     # to be done which is in the same real page.
1538                     with m.If(req.valid):
1539                         _ra = req.real_addr[ROW_LINE_BITS:SET_SIZE_BITS]
1540                         sync += r1.wb.adr[0:SET_SIZE_BITS].eq(_ra)
1541                         sync += r1.wb.dat.eq(req.data)
1542                         sync += r1.wb.sel.eq(req.byte_sel)
1543
1544                     with m.If((adjust_acks < 7) & req.same_tag &
1545                                 ((req.op == Op.OP_STORE_MISS)
1546                                  | (req.op == Op.OP_STORE_HIT))):
1547                         sync += r1.wb.stb.eq(1)
1548                         comb += st_stbs_done.eq(0)
1549
1550                         with m.If(req.op == Op.OP_STORE_HIT):
1551                             sync += r1.write_bram.eq(1)
1552                         sync += r1.full.eq(0)
1553                         sync += r1.slow_valid.eq(1)
1554
1555                         # Store requests never come from the MMU
1556                         sync += r1.ls_valid.eq(1)
1557                         comb += st_stbs_done.eq(0)
1558                         sync += r1.inc_acks.eq(1)
1559                     with m.Else():
1560                         sync += r1.wb.stb.eq(0)
1561                         comb += st_stbs_done.eq(1)
1562
1563                 # Got ack ? See if complete.
1564                 with m.If(bus.ack):
1565                     with m.If(st_stbs_done & (adjust_acks == 1)):
1566                         sync += r1.state.eq(State.IDLE)
1567                         sync += r1.wb.cyc.eq(0)
1568                         sync += r1.wb.stb.eq(0)
1569                     sync += r1.dec_acks.eq(1)
1570
1571             with m.Case(State.NC_LOAD_WAIT_ACK):
1572                 # Clear stb when slave accepted request
1573                 with m.If(~bus.stall):
1574                     sync += r1.wb.stb.eq(0)
1575
1576                 # Got ack ? complete.
1577                 with m.If(bus.ack):
1578                     sync += r1.state.eq(State.IDLE)
1579                     sync += r1.full.eq(0)
1580                     sync += r1.slow_valid.eq(1)
1581
1582                     with m.If(~r1.mmu_req):
1583                         sync += r1.ls_valid.eq(1)
1584                     with m.Else():
1585                         sync += r1.mmu_done.eq(1)
1586
1587                     sync += r1.forward_sel.eq(~0) # all 1s
1588                     sync += r1.use_forward1.eq(1)
1589                     sync += r1.wb.cyc.eq(0)
1590                     sync += r1.wb.stb.eq(0)
1591
1592     def dcache_log(self, m, r1, valid_ra, tlb_hit, stall_out):
1593
1594         sync = m.d.sync
1595         d_out, bus, log_out = self.d_out, self.bus, self.log_out
1596
1597         sync += log_out.eq(Cat(r1.state[:3], valid_ra, tlb_hit.way[:3],
1598                                stall_out, req_op[:3], d_out.valid, d_out.error,
1599                                r1.wb.cyc, r1.wb.stb, bus.ack, bus.stall,
1600                                r1.real_adr[3:6]))
1601
    def elaborate(self, platform):
        """Build and return the nmigen Module for the data cache.

        The method creates the signals and records shared between the
        pipeline stages, adds the top-level combinatorial wiring (stall
        logic and the wishbone outputs taken from ``r1.wb``), instantiates
        the ``DTLBUpdate`` submodule, and then calls each helper method in
        turn, handing it the module and the shared signals it works on.

        ``platform`` is not used in this method.

        Returns the populated ``Module``.
        """

        m = Module()
        comb = m.d.comb
        d_in = self.d_in

        # Storage. Hopefully "cache_rows" is a BRAM, the rest is LUTs
        cache_tags       = CacheTagArray()
        cache_tag_set    = Signal(TAG_RAM_WIDTH)

        # TODO attribute ram_style : string;
        # TODO attribute ram_style of cache_tags : signal is "distributed";

        """note: these are passed to nmigen.hdl.Memory as "attributes".
           don't know how, just that they are.
        """
        # TODO attribute ram_style of
        #  dtlb_tags : signal is "distributed";
        # TODO attribute ram_style of
        #  dtlb_ptes : signal is "distributed";

        # Stage 0 register record plus its "holds a request" flag
        r0      = RegStage0("r0")
        r0_full = Signal()

        # Stage 1 register record; its wb fields drive the wishbone
        # outputs further down
        r1 = RegStage1("r1")

        reservation = Reservation()

        # Async signals on incoming request
        req_index    = Signal(INDEX_BITS)
        req_row      = Signal(ROW_BITS)
        req_hit_way  = Signal(WAY_BITS)
        req_tag      = Signal(TAG_BITS)
        req_op       = Signal(Op)
        # NOTE(review): req_data is not referenced again in this method
        req_data     = Signal(64)
        req_same_tag = Signal()
        req_go       = Signal()

        early_req_row     = Signal(ROW_BITS)

        cancel_store      = Signal()
        set_rsrv          = Signal()
        clear_rsrv        = Signal()

        r0_valid          = Signal()
        r0_stall          = Signal()

        use_forward1_next = Signal()
        use_forward2_next = Signal()

        cache_out_row     = Signal(WB_DATA_BITS)

        plru_victim       = Signal(WAY_BITS)
        replace_way       = Signal(WAY_BITS)

        # Wishbone read/write/cache write formatting signals
        # NOTE(review): bus_sel is not referenced again in this method
        bus_sel           = Signal(8)

        # TLB signals
        tlb_way       = TLBRecord("tlb_way")
        tlb_req_index = Signal(TLB_SET_BITS)
        tlb_hit       = TLBHit("tlb_hit")
        pte           = Signal(TLB_PTE_BITS)
        ra            = Signal(REAL_ADDR_BITS)
        valid_ra      = Signal()
        perm_attr     = PermAttr("dc_perms")
        rc_ok         = Signal()
        perm_ok       = Signal()
        access_ok     = Signal()

        tlb_plru_victim = Signal(TLB_WAY_BITS)

        # we don't yet handle collisions between loadstore1 requests
        # and MMU requests
        comb += self.m_out.stall.eq(0)

        # Hold off the request in r0 when r1 has an uncompleted request
        # (or when d_in.hold is asserted).  r0_valid is the complementary
        # case: r0 holds a request and neither r1.full nor d_in.hold is set.
        comb += r0_stall.eq(r0_full & (r1.full | d_in.hold))
        comb += r0_valid.eq(r0_full & ~r1.full & ~d_in.hold)
        comb += self.stall_out.eq(r0_stall)

        # deal with litex not doing wishbone pipeline mode
        # XXX in wrong way.  FIFOs are needed in the SRAM test
        # so that stb/ack match up. same thing done in icache.py
        # (stall is synthesised here as "cycle active and no ack yet")
        comb += self.bus.stall.eq(self.bus.cyc & ~self.bus.ack)

        # Wire up wishbone request latch out of stage 1
        comb += self.bus.we.eq(r1.wb.we)
        comb += self.bus.adr.eq(r1.wb.adr)
        comb += self.bus.sel.eq(r1.wb.sel)
        comb += self.bus.stb.eq(r1.wb.stb)
        comb += self.bus.dat_w.eq(r1.wb.dat)
        comb += self.bus.cyc.eq(r1.wb.cyc)

        # create submodule TLBUpdate
        # (also kept on self as self.dtlb_update; its dtlb attribute is
        # handed to the TLB helper methods below)
        m.submodules.dtlb_update = self.dtlb_update = DTLBUpdate()
        dtlb = self.dtlb_update.dtlb

        # call sub-functions putting everything together, using shared
        # signals established above
        self.stage_0(m, r0, r1, r0_full)
        self.tlb_read(m, r0_stall, tlb_way, dtlb)
        self.tlb_search(m, tlb_req_index, r0, r0_valid,
                        tlb_way,
                        pte, tlb_hit, valid_ra, perm_attr, ra)
        self.tlb_update(m, r0_valid, r0, dtlb, tlb_req_index,
                        tlb_hit, tlb_plru_victim,
                        tlb_way)
        self.maybe_plrus(m, r1, plru_victim)
        self.maybe_tlb_plrus(m, r1, tlb_plru_victim, tlb_req_index)
        self.cache_tag_read(m, r0_stall, req_index, cache_tag_set, cache_tags)
        self.dcache_request(m, r0, ra, req_index, req_row, req_tag,
                           r0_valid, r1, cache_tags, replace_way,
                           use_forward1_next, use_forward2_next,
                           req_hit_way, plru_victim, rc_ok, perm_attr,
                           valid_ra, perm_ok, access_ok, req_op, req_go,
                           tlb_hit, tlb_way, cache_tag_set,
                           cancel_store, req_same_tag, r0_stall, early_req_row)
        self.reservation_comb(m, cancel_store, set_rsrv, clear_rsrv,
                           r0_valid, r0, reservation)
        self.reservation_reg(m, r0_valid, access_ok, set_rsrv, clear_rsrv,
                           reservation, r0)
        self.writeback_control(m, r1, cache_out_row)
        self.rams(m, r1, early_req_row, cache_out_row, replace_way)
        self.dcache_fast_hit(m, req_op, r0_valid, r0, r1,
                        req_hit_way, req_index, req_tag, access_ok,
                        tlb_hit, tlb_req_index)
        self.dcache_slow(m, r1, use_forward1_next, use_forward2_next,
                    r0, replace_way,
                    req_hit_way, req_same_tag,
                         r0_valid, req_op, cache_tags, req_go, ra)
        # Logging is disabled.  NOTE(review): stall_out is not a local name
        # in this method (the stall output above is self.stall_out), so the
        # call below would need adjusting before it could be re-enabled.
        #self.dcache_log(m, r1, valid_ra, tlb_hit, stall_out)

        return m
1736
1737
if __name__ == '__main__':
    # Stand-alone run: convert a default DCache to RTLIL and write the
    # result to test_dcache.il in the current directory.
    dcache = DCache()
    il_text = rtlil.convert(dcache, ports=[])
    with open("test_dcache.il", "w") as il_file:
        il_file.write(il_text)