1 #!/usr/bin/env python3
2 #
3 # Copyright (C) 2020,2021 Luke Kenneth Casson Leighton <lkcl@lkcl.net>
4 # Copyright (C) 2020 Cole Poirier
5 # Copyright (C) 2020,2021 Cesar Strauss
6 # Copyright (C) 2021 Tobias Platen
7 #
8 # Original dcache.vhdl Copyright of its authors and licensed
9 # by IBM under CC-BY 4.0
10 # https://github.com/antonblanchard/microwatt
11 #
12 # Conversion to nmigen funded by NLnet and NGI POINTER under EU Grants
13 # 871528 and 957073, under the LGPL-v3+ License
14
15 """DCache
16
17 based on Anton Blanchard's microwatt dcache.vhdl
18
19 note that the microwatt dcache wishbone interface expects "stall".
20 for simplicity at the moment this is hard-coded to cyc & ~ack.
21 see WB4 spec, p84, section 5.2.1
22
23 IMPORTANT: for store, the data is sampled the cycle AFTER the "valid"
24 is raised. sigh
25
26 Links:
27
28 * https://libre-soc.org/3d_gpu/architecture/set_associative_cache.jpg
29 * https://bugs.libre-soc.org/show_bug.cgi?id=469
30 * https://libre-soc.org/irclog-microwatt/%23microwatt.2021-12-07.log.html
31 (discussion about brams for ECP5)
32
33 """
34
35 import sys
36
37 from nmutil.gtkw import write_gtkw
38
39 sys.setrecursionlimit(1000000)
40
41 from enum import Enum, unique
42
43 from nmigen import (Module, Signal, Elaboratable, Cat, Repl, Array, Const,
44 Record, Memory)
45 from nmutil.util import Display
46 from nmigen.lib.coding import Decoder
47
48 from copy import deepcopy
49 from random import randint, seed
50
51 from nmigen_soc.wishbone.bus import Interface
52
53 from nmigen.cli import main
54 from nmutil.iocontrol import RecordObject
55 from nmigen.utils import log2_int
56 from soc.experiment.mem_types import (LoadStore1ToDCacheType,
57 DCacheToLoadStore1Type,
58 MMUToDCacheType,
59 DCacheToMMUType)
60
61 from soc.experiment.wb_types import (WB_ADDR_BITS, WB_DATA_BITS, WB_SEL_BITS,
62 WBAddrType, WBDataType, WBSelType,
63 WBMasterOut, WBSlaveOut,
64 WBMasterOutVector, WBSlaveOutVector,
65 WBIOMasterOut, WBIOSlaveOut)
66
67 from soc.experiment.cache_ram import CacheRam
68 from soc.experiment.plru import PLRU, PLRUs
69 #from nmutil.plru import PLRU, PLRUs
70
71 # for test
72 from soc.bus.sram import SRAM
73 from nmigen import Memory
74 from nmigen.cli import rtlil
75
76 # NOTE: to use cxxsim, export NMIGEN_SIM_MODE=cxxsim from the shell
77 # Also, check out the cxxsim nmigen branch, and latest yosys from git
78 from nmutil.sim_tmp_alternative import Simulator
79
80 from nmutil.util import wrap
81
82 LOG_LENGTH = 0 # Non-zero to enable log data collection
83
84 def ispow2(x):
85 return (1<<log2_int(x, False)) == x
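# e.g. ispow2(64) == True but ispow2(48) == False: log2_int(x, False)
# rounds up, so shifting back only reproduces x for powers of 2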
86
87
88 class DCacheConfig:
89 def __init__(self, LINE_SIZE = 64, # Line size in bytes
90 NUM_LINES = 64, # Number of lines in a set
91 NUM_WAYS = 2, # Number of ways
92 TLB_SET_SIZE = 64, # L1 DTLB entries per set
93 TLB_NUM_WAYS = 2, # L1 DTLB number of ways
94 TLB_LG_PGSZ = 12): # L1 DTLB log_2(page_size)
95 self.LINE_SIZE = LINE_SIZE
96 self.NUM_LINES = NUM_LINES
97 self.NUM_WAYS = NUM_WAYS
98 self.TLB_SET_SIZE = TLB_SET_SIZE
99 self.TLB_NUM_WAYS = TLB_NUM_WAYS
100 self.TLB_LG_PGSZ = TLB_LG_PGSZ
101
102 # BRAM organisation: We never access more than
103 # WB_DATA_BITS at a time, so to save
104 # resources we make the array only that wide, and
105 # use consecutive indices to make a cache "line"
106 #
107 # ROW_SIZE is the width in bytes of the BRAM
108 # (based on WB, so 64-bits)
109 self.ROW_SIZE = WB_DATA_BITS // 8
110
111 # ROW_PER_LINE is the number of row (wishbone
112 # transactions) in a line
113 self.ROW_PER_LINE = self.LINE_SIZE // self.ROW_SIZE
114
115 # BRAM_ROWS is the number of rows in BRAM needed
116 # to represent the full dcache
117 self.BRAM_ROWS = self.NUM_LINES * self.ROW_PER_LINE
118
119 print ("ROW_SIZE", self.ROW_SIZE)
120 print ("ROW_PER_LINE", self.ROW_PER_LINE)
121 print ("BRAM_ROWS", self.BRAM_ROWS)
122 print ("NUM_WAYS", self.NUM_WAYS)
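# worked example with the defaults: LINE_SIZE=64 and WB_DATA_BITS=64
# give ROW_SIZE = 64//8 = 8 bytes, ROW_PER_LINE = 64//8 = 8,
# BRAM_ROWS = 64*8 = 512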
123
124 # Bit fields counts in the address
125
126 # REAL_ADDR_BITS is the number of real address
127 # bits that we store
128 self.REAL_ADDR_BITS = 56
129
130 # ROW_BITS is the number of bits to select a row
131 self.ROW_BITS = log2_int(self.BRAM_ROWS)
132
133 # ROW_LINE_BITS is the number of bits to select
134 # a row within a line
135 self.ROW_LINE_BITS = log2_int(self.ROW_PER_LINE)
136
137 # LINE_OFF_BITS is the number of bits for
138 # the offset in a cache line
139 self.LINE_OFF_BITS = log2_int(self.LINE_SIZE)
140
141 # ROW_OFF_BITS is the number of bits for
142 # the offset in a row
143 self.ROW_OFF_BITS = log2_int(self.ROW_SIZE)
144
145 # INDEX_BITS is the number of bits to
146 # select a cache line
147 self.INDEX_BITS = log2_int(self.NUM_LINES)
148
149 # SET_SIZE_BITS is the log base 2 of the set size
150 self.SET_SIZE_BITS = self.LINE_OFF_BITS + self.INDEX_BITS
151
152 # TAG_BITS is the number of bits of
153 # the tag part of the address
154 self.TAG_BITS = self.REAL_ADDR_BITS - self.SET_SIZE_BITS
155
156 # TAG_WIDTH is the width in bits of each way of the tag RAM
157 self.TAG_WIDTH = self.TAG_BITS + 7 - ((self.TAG_BITS + 7) % 8)
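# e.g. with the defaults TAG_BITS = 56-12 = 44, which the
# round-up-to-a-whole-byte above widens to TAG_WIDTH = 48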
158
159 # WAY_BITS is the number of bits to select a way
160 self.WAY_BITS = log2_int(self.NUM_WAYS)
161
162 # Example of layout for 32 lines of 64 bytes:
163 layout = f"""\
164 DCache Layout:
165 |.. -----------------------| REAL_ADDR_BITS ({self.REAL_ADDR_BITS})
166 .. |--------------| SET_SIZE_BITS ({self.SET_SIZE_BITS})
167 .. tag |index| line |
168 .. | row | |
169 .. | |---| | ROW_LINE_BITS ({self.ROW_LINE_BITS})
170 .. | |--- - --| LINE_OFF_BITS ({self.LINE_OFF_BITS})
171 .. | |- --| ROW_OFF_BITS ({self.ROW_OFF_BITS})
172 .. |----- ---| | ROW_BITS ({self.ROW_BITS})
173 .. |-----| | INDEX_BITS ({self.INDEX_BITS})
174 .. --------| | TAG_BITS ({self.TAG_BITS})
175 """
176 print (layout)
177 print ("Dcache TAG %d IDX %d ROW_BITS %d ROFF %d LOFF %d RLB %d" % \
178 (self.TAG_BITS, self.INDEX_BITS, self.ROW_BITS,
179 self.ROW_OFF_BITS, self.LINE_OFF_BITS, self.ROW_LINE_BITS))
180 print ("index @: %d-%d" % (self.LINE_OFF_BITS, self.SET_SIZE_BITS))
181 print ("row @: %d-%d" % (self.ROW_OFF_BITS, self.SET_SIZE_BITS))
182 print ("tag @: %d-%d width %d" % (self.SET_SIZE_BITS,
183 self.REAL_ADDR_BITS, self.TAG_WIDTH))
184
185 self.TAG_RAM_WIDTH = self.TAG_WIDTH * self.NUM_WAYS
186
187 print ("TAG_RAM_WIDTH", self.TAG_RAM_WIDTH)
188 print (" TAG_WIDTH", self.TAG_WIDTH)
189 print (" NUM_WAYS", self.NUM_WAYS)
190 print (" NUM_LINES", self.NUM_LINES)
191
192 # L1 TLB
193 self.TLB_SET_BITS = log2_int(self.TLB_SET_SIZE)
194 self.TLB_WAY_BITS = log2_int(self.TLB_NUM_WAYS)
195 self.TLB_EA_TAG_BITS = 64 - (self.TLB_LG_PGSZ + self.TLB_SET_BITS)
196 self.TLB_TAG_WAY_BITS = self.TLB_NUM_WAYS * self.TLB_EA_TAG_BITS
197 self.TLB_PTE_BITS = 64
198 self.TLB_PTE_WAY_BITS = self.TLB_NUM_WAYS * self.TLB_PTE_BITS
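# with the defaults: TLB_SET_BITS=6, TLB_WAY_BITS=1,
# TLB_EA_TAG_BITS = 64-(12+6) = 46, TLB_TAG_WAY_BITS = 2*46 = 92,
# TLB_PTE_WAY_BITS = 2*64 = 128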
199
200 assert (self.LINE_SIZE % self.ROW_SIZE) == 0, "LINE_SIZE not multiple of ROW_SIZE"
201 assert ispow2(self.LINE_SIZE), "LINE_SIZE not power of 2"
202 assert ispow2(self.NUM_LINES), "NUM_LINES not power of 2"
203 assert ispow2(self.ROW_PER_LINE), "ROW_PER_LINE not power of 2"
204 assert self.ROW_BITS == \
205 (self.INDEX_BITS + self.ROW_LINE_BITS), \
206 "geometry bits don't add up"
207 assert (self.LINE_OFF_BITS == \
208 self.ROW_OFF_BITS + self.ROW_LINE_BITS), \
209 "geometry bits don't add up"
210 assert self.REAL_ADDR_BITS == \
211 (self.TAG_BITS + self.INDEX_BITS + self.LINE_OFF_BITS), \
212 "geometry bits don't add up"
213 assert self.REAL_ADDR_BITS == \
214 (self.TAG_BITS + self.ROW_BITS + self.ROW_OFF_BITS), \
215 "geometry bits don't add up"
216 assert 64 == WB_DATA_BITS, \
217 "Can't yet handle wb width that isn't 64-bits"
218 assert self.SET_SIZE_BITS <= self.TLB_LG_PGSZ, \
219 "Set indexed by virtual address"
220
221 def CacheTagArray(self):
222 return Array(Signal(self.TAG_RAM_WIDTH, name="tag%d" % x) \
223 for x in range(self.NUM_LINES))
224
225 def CacheValidsArray(self):
226 return Array(Signal(self.NUM_WAYS, name="tag_valids%d" % x)
227 for x in range(self.NUM_LINES))
228
229 def RowPerLineValidArray(self):
230 return Array(Signal(name="rows_valid%d" % x) \
231 for x in range(self.ROW_PER_LINE))
232
233 def TLBHit(self, name):
234 return Record([('valid', 1),
235 ('way', self.TLB_WAY_BITS)], name=name)
236
237 def TLBTagEAArray(self):
238 return Array(Signal(self.TLB_EA_TAG_BITS, name="tlbtagea%d" % x) \
239 for x in range (self.TLB_NUM_WAYS))
240
241 def TLBRecord(self, name):
242 tlb_layout = [('valid', self.TLB_NUM_WAYS),
243 ('tag', self.TLB_TAG_WAY_BITS),
244 ('pte', self.TLB_PTE_WAY_BITS)
245 ]
246 return Record(tlb_layout, name=name)
247
248 def TLBValidArray(self):
249 return Array(Signal(self.TLB_NUM_WAYS, name="tlb_valid%d" % x)
250 for x in range(self.TLB_SET_SIZE))
251
252 def HitWaySet(self):
253 return Array(Signal(self.WAY_BITS, name="hitway_%d" % x) \
254 for x in range(self.TLB_NUM_WAYS))
255
256 # Cache RAM interface
257 def CacheRamOut(self):
258 return Array(Signal(WB_DATA_BITS, name="cache_out%d" % x) \
259 for x in range(self.NUM_WAYS))
260
261 # PLRU output interface
262 def PLRUOut(self):
263 return Array(Signal(self.WAY_BITS, name="plru_out%d" % x) \
264 for x in range(self.NUM_LINES))
265
266 # TLB PLRU output interface
267 def TLBPLRUOut(self):
268 return Array(Signal(self.TLB_WAY_BITS, name="tlbplru_out%d" % x) \
269 for x in range(self.TLB_SET_SIZE))
270
271 # Helper functions to decode incoming requests
272 #
273 # Return the cache line index (tag index) for an address
274 def get_index(self, addr):
275 return addr[self.LINE_OFF_BITS:self.SET_SIZE_BITS]
276
277 # Return the cache row index (data memory) for an address
278 def get_row(self, addr):
279 return addr[self.ROW_OFF_BITS:self.SET_SIZE_BITS]
280
281 # Return the index of a row within a line
282 def get_row_of_line(self, row):
283 return row[:self.ROW_BITS][:self.ROW_LINE_BITS]
284
285 # Returns whether this is the last row of a line
286 def is_last_row_addr(self, addr, last):
287 return addr[self.ROW_OFF_BITS:self.LINE_OFF_BITS] == last
288
289 # Returns whether this is the last row of a line
290 def is_last_row(self, row, last):
291 return self.get_row_of_line(row) == last
292
293 # Return the next row in the current cache line. We use a
294 # dedicated function in order to limit the size of the
295 # generated adder to be only the bits within a cache line
296 # (3 bits with default settings)
297 def next_row(self, row):
298 row_v = row[0:self.ROW_LINE_BITS] + 1
299 return Cat(row_v[:self.ROW_LINE_BITS], row[self.ROW_LINE_BITS:])
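# e.g. with ROW_LINE_BITS=3 a row ending in 0b111 wraps to 0b000,
# leaving the upper (line-select) bits of the row unchanged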
300
301 # Get the tag value from the address
302 def get_tag(self, addr):
303 return addr[self.SET_SIZE_BITS:self.REAL_ADDR_BITS]
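# with the default geometry an address therefore decomposes as:
# [0:3] byte-in-row, [3:6] row-in-line, [6:12] index, [12:56] tag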
304
305 # Read a tag from a tag memory row
306 def read_tag(self, way, tagset):
307 return tagset.word_select(way, self.TAG_WIDTH)[:self.TAG_BITS]
308
309 # Read a TLB tag from a TLB tag memory row
310 def read_tlb_tag(self, way, tags):
311 return tags.word_select(way, self.TLB_EA_TAG_BITS)
312
313 # Write a TLB tag to a TLB tag memory row
314 def write_tlb_tag(self, way, tags, tag):
315 return self.read_tlb_tag(way, tags).eq(tag)
316
317 # Read a PTE from a TLB PTE memory row
318 def read_tlb_pte(self, way, ptes):
319 return ptes.word_select(way, self.TLB_PTE_BITS)
320
321 def write_tlb_pte(self, way, ptes, newpte):
322 return self.read_tlb_pte(way, ptes).eq(newpte)
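# note: the read_tlb_* helpers return a word_select() slice, which
# nmigen permits as an assignment target, so the write_tlb_* helpers
# simply .eq() onto the corresponding read slice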
323
324
325 # Record for storing permission, attribute, etc. bits from a PTE
326 class PermAttr(RecordObject):
327 def __init__(self, name=None):
328 super().__init__(name=name)
329 self.reference = Signal()
330 self.changed = Signal()
331 self.nocache = Signal()
332 self.priv = Signal()
333 self.rd_perm = Signal()
334 self.wr_perm = Signal()
335
336
337 def extract_perm_attr(pte):
338 pa = PermAttr()
339 return pa
340
341
342 # Type of operation on a "valid" input
343 @unique
344 class Op(Enum):
345 OP_NONE = 0
346 OP_BAD = 1 # NC cache hit, TLB miss, prot/RC failure
347 OP_STCX_FAIL = 2 # conditional store w/o reservation
348 OP_LOAD_HIT = 3 # Cache hit on load
349 OP_LOAD_MISS = 4 # Load missing cache
350 OP_LOAD_NC = 5 # Non-cachable load
351 OP_STORE_HIT = 6 # Store hitting cache
352 OP_STORE_MISS = 7 # Store missing cache
353
354
355 # Cache state machine
356 @unique
357 class State(Enum):
358 IDLE = 0 # Normal load hit processing
359 RELOAD_WAIT_ACK = 1 # Cache reload wait ack
360 STORE_WAIT_ACK = 2 # Store wait ack
361 NC_LOAD_WAIT_ACK = 3 # Non-cachable load wait ack
362
363
364 # Dcache operations:
365 #
366 # In order to make timing, we use the BRAMs with
367 # an output buffer, which means that the BRAM
368 # output is delayed by an extra cycle.
369 #
370 # Thus, the dcache has a 2-stage internal pipeline
371 # for cache hits with no stalls.
372 #
373 # All other operations are handled via stalling
374 # in the first stage.
375 #
376 # The second stage can thus complete a hit at the same
377 # time as the first stage emits a stall for a complex op.
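#
# A rough load-hit timeline (an illustrative sketch, not cycle-exact):
#   cycle 0: request arrives; tag/data BRAM reads are issued
#   cycle 1: request in r0; TLB compare and hit detection
#   cycle 2: buffered BRAM output valid; load data is returned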
378 #
379 # Stage 0 register, basically contains just the latched request
380
381 class RegStage0(RecordObject):
382 def __init__(self, name=None):
383 super().__init__(name=name)
384 self.req = LoadStore1ToDCacheType(name="lsmem")
385 self.tlbie = Signal() # indicates a tlbie request (from MMU)
386 self.doall = Signal() # with tlbie, indicates flush whole TLB
387 self.tlbld = Signal() # indicates a TLB load request (from MMU)
388 self.mmu_req = Signal() # indicates source of request
389 self.d_valid = Signal() # indicates req.data is valid now
390
391
392 class MemAccessRequest(RecordObject):
393 def __init__(self, cfg, name=None):
394 super().__init__(name=name)
395 self.op = Signal(Op)
396 self.valid = Signal()
397 self.dcbz = Signal()
398 self.real_addr = Signal(cfg.REAL_ADDR_BITS)
399 self.data = Signal(64)
400 self.byte_sel = Signal(8)
401 self.hit_way = Signal(cfg.WAY_BITS)
402 self.same_tag = Signal()
403 self.mmu_req = Signal()
404
405
406 # First stage register, contains state for stage 1 of load hits
407 # and for the state machine used by all other operations
408 class RegStage1(RecordObject):
409 def __init__(self, cfg, name=None):
410 super().__init__(name=name)
411 # Info about the request
412 self.full = Signal() # have uncompleted request
413 self.mmu_req = Signal() # request is from MMU
414 self.req = MemAccessRequest(cfg, name="reqmem")
415
416 # Cache hit state
417 self.hit_way = Signal(cfg.WAY_BITS)
418 self.hit_load_valid = Signal()
419 self.hit_index = Signal(cfg.INDEX_BITS)
420 self.cache_hit = Signal()
421
422 # TLB hit state
423 self.tlb_hit = cfg.TLBHit("tlb_hit")
424 self.tlb_hit_index = Signal(cfg.TLB_SET_BITS)
425
426 # 2-stage data buffer for data forwarded from writes to reads
427 self.forward_data1 = Signal(64)
428 self.forward_data2 = Signal(64)
429 self.forward_sel1 = Signal(8)
430 self.forward_valid1 = Signal()
431 self.forward_way1 = Signal(cfg.WAY_BITS)
432 self.forward_row1 = Signal(cfg.ROW_BITS)
433 self.use_forward1 = Signal()
434 self.forward_sel = Signal(8)
435
436 # Cache miss state (reload state machine)
437 self.state = Signal(State)
438 self.dcbz = Signal()
439 self.write_bram = Signal()
440 self.write_tag = Signal()
441 self.slow_valid = Signal()
442 self.wb = WBMasterOut("wb")
443 self.reload_tag = Signal(cfg.TAG_BITS)
444 self.store_way = Signal(cfg.WAY_BITS)
445 self.store_row = Signal(cfg.ROW_BITS)
446 self.store_index = Signal(cfg.INDEX_BITS)
447 self.end_row_ix = Signal(cfg.ROW_LINE_BITS)
448 self.rows_valid = cfg.RowPerLineValidArray()
449 self.acks_pending = Signal(3)
450 self.inc_acks = Signal()
451 self.dec_acks = Signal()
452
453 # Signals to complete (possibly with error)
454 self.ls_valid = Signal()
455 self.ls_error = Signal()
456 self.mmu_done = Signal()
457 self.mmu_error = Signal()
458 self.cache_paradox = Signal()
459
460 # Signal to complete a failed stcx.
461 self.stcx_fail = Signal()
462
463
464 # Reservation information
465 class Reservation(RecordObject):
466 def __init__(self, cfg, name=None):
467 super().__init__(name=name)
468 self.valid = Signal()
469 self.addr = Signal(64-cfg.LINE_OFF_BITS)
470
471
472 class DTLBUpdate(Elaboratable):
473 def __init__(self, cfg):
474 self.cfg = cfg
475 self.tlbie = Signal()
476 self.tlbwe = Signal()
477 self.doall = Signal()
478 self.tlb_hit = cfg.TLBHit("tlb_hit")
479 self.tlb_req_index = Signal(cfg.TLB_SET_BITS)
480
481 self.repl_way = Signal(cfg.TLB_WAY_BITS)
482 self.eatag = Signal(cfg.TLB_EA_TAG_BITS)
483 self.pte_data = Signal(cfg.TLB_PTE_BITS)
484
485 # read from dtlb array
486 self.tlb_read = Signal()
487 self.tlb_read_index = Signal(cfg.TLB_SET_BITS)
488 self.tlb_way = cfg.TLBRecord("o_tlb_way")
489
490 def elaborate(self, platform):
491 m = Module()
492 comb = m.d.comb
493 sync = m.d.sync
494 cfg = self.cfg
495
496 # there are 3 parts to this:
497 # QTY TLB_NUM_WAYs TAGs - of width (say) 46 bits of Effective Address
498 # QTY TLB_NUM_WAYs PTEs - of width (say) 64 bits
499 # "Valid" bits, one per "way", of QTY TLB_NUM_WAYs. these cannot
500 # be a Memory because they can all be cleared (tlbie, doall), i mean,
501 # we _could_, in theory, by overriding the Reset Signal of the Memory,
502 # hmmm....
503
504 dtlb_valid = cfg.TLBValidArray()
505 tlb_req_index = self.tlb_req_index
506
507 print ("TLB_TAG_WAY_BITS", cfg.TLB_TAG_WAY_BITS)
508 print (" TLB_EA_TAG_BITS", cfg.TLB_EA_TAG_BITS)
509 print (" TLB_NUM_WAYS", cfg.TLB_NUM_WAYS)
510 print ("TLB_PTE_WAY_BITS", cfg.TLB_PTE_WAY_BITS)
511 print (" TLB_PTE_BITS", cfg.TLB_PTE_BITS)
512 print (" TLB_NUM_WAYS", cfg.TLB_NUM_WAYS)
513
514 # TAG and PTE Memory SRAMs. transparent, write-enables are TLB_NUM_WAYS
515 tagway = Memory(depth=cfg.TLB_SET_SIZE, width=cfg.TLB_TAG_WAY_BITS)
516 m.submodules.rd_tagway = rd_tagway = tagway.read_port()
517 m.submodules.wr_tagway = wr_tagway = tagway.write_port(
518 granularity=cfg.TLB_EA_TAG_BITS)
519
520 pteway = Memory(depth=cfg.TLB_SET_SIZE, width=cfg.TLB_PTE_WAY_BITS)
521 m.submodules.rd_pteway = rd_pteway = pteway.read_port()
522 m.submodules.wr_pteway = wr_pteway = pteway.write_port(
523 granularity=cfg.TLB_PTE_BITS)
524
525 # commented out for now, can be put in if Memory.reset can be
526 # used for tlbie&doall to reset the entire Memory to zero in 1 cycle
527 #validm = Memory(depth=TLB_SET_SIZE, width=TLB_NUM_WAYS)
528 #m.submodules.rd_valid = rd_valid = validm.read_port()
529 #m.submodules.wr_valid = wr_valid = validm.write_port(
530 #granularity=1)
531
532 # connect up read and write addresses to Valid/PTE/TAG SRAMs
533 m.d.comb += rd_pteway.addr.eq(self.tlb_read_index)
534 m.d.comb += rd_tagway.addr.eq(self.tlb_read_index)
535 #m.d.comb += rd_valid.addr.eq(self.tlb_read_index)
536 m.d.comb += wr_tagway.addr.eq(tlb_req_index)
537 m.d.comb += wr_pteway.addr.eq(tlb_req_index)
538 #m.d.comb += wr_valid.addr.eq(tlb_req_index)
539
540 updated = Signal()
541 v_updated = Signal()
542 tb_out = Signal(cfg.TLB_TAG_WAY_BITS) # tlb_way_tags_t
543 db_out = Signal(cfg.TLB_NUM_WAYS) # tlb_way_valids_t
544 pb_out = Signal(cfg.TLB_PTE_WAY_BITS) # tlb_way_ptes_t
545 dv = Signal(cfg.TLB_NUM_WAYS) # tlb_way_valids_t
546
547 comb += dv.eq(dtlb_valid[tlb_req_index])
548 comb += db_out.eq(dv)
549
550 with m.If(self.tlbie & self.doall):
551 # clear all valid bits at once
552 # XXX hmmm, validm _could_ use Memory reset here...
553 for i in range(cfg.TLB_SET_SIZE):
554 sync += dtlb_valid[i].eq(0)
555 with m.Elif(self.tlbie):
556 # invalidate just the hit_way
557 with m.If(self.tlb_hit.valid):
558 comb += db_out.bit_select(self.tlb_hit.way, 1).eq(0)
559 comb += v_updated.eq(1)
560 with m.Elif(self.tlbwe):
561 # write to the requested tag and PTE
562 comb += cfg.write_tlb_tag(self.repl_way, tb_out, self.eatag)
563 comb += cfg.write_tlb_pte(self.repl_way, pb_out, self.pte_data)
564 # set valid bit
565 comb += db_out.bit_select(self.repl_way, 1).eq(1)
566
567 comb += updated.eq(1)
568 comb += v_updated.eq(1)
569
570 # above, sometimes valid is requested to be updated but data not
571 # therefore split them out, here. note the granularity thing matches
572 # with the shift-up of the eatag/pte_data into the correct TLB way.
573 # thus it is not necessary to write the entire lot, just the portion
574 # being altered: hence writing the *old* copy of the row is not needed
575 with m.If(updated): # PTE and TAG to be written
576 comb += wr_pteway.data.eq(pb_out)
577 comb += wr_pteway.en.eq(1<<self.repl_way)
578 comb += wr_tagway.data.eq(tb_out)
579 comb += wr_tagway.en.eq(1<<self.repl_way)
580 with m.If(v_updated): # Valid to be written
581 sync += dtlb_valid[tlb_req_index].eq(db_out)
582 #comb += wr_valid.data.eq(db_out)
583 #comb += wr_valid.en.eq(1<<self.repl_way)
584
585 # select one TLB way, use a register here
586 r_delay = Signal()
587 sync += r_delay.eq(self.tlb_read)
588 # first deal with the valids, which are not in a Memory.
589 # tlb way valid is output on a 1 clock delay with sync,
590 # but have to explicitly deal with "forwarding" here
591 with m.If(self.tlb_read):
592 with m.If(v_updated): # write *and* read in same cycle: forward
593 sync += self.tlb_way.valid.eq(db_out)
594 with m.Else():
595 sync += self.tlb_way.valid.eq(dtlb_valid[self.tlb_read_index])
596 # now deal with the Memory-read case. the output must remain
597 # valid (stable) even when a read-request is not made, but stable
598 # on a one-clock delay, hence the register
599 r_tlb_way = cfg.TLBRecord("r_tlb_way")
600 with m.If(r_delay):
601 # on one clock delay, capture the contents of the read port(s)
602 comb += self.tlb_way.tag.eq(rd_tagway.data)
603 comb += self.tlb_way.pte.eq(rd_pteway.data)
604 sync += r_tlb_way.tag.eq(rd_tagway.data)
605 sync += r_tlb_way.pte.eq(rd_pteway.data)
606 with m.Else():
607 # ... so that the register can output it when no read is requested
608 # it's rather overkill but better to be safe than sorry
609 comb += self.tlb_way.tag.eq(r_tlb_way.tag)
610 comb += self.tlb_way.pte.eq(r_tlb_way.pte)
611 #comb += self.tlb_way.eq(r_tlb_way)
612
613 return m
614
615
616 class DCachePendingHit(Elaboratable):
617
618 def __init__(self, cfg, tlb_way,
619 cache_i_validdx, cache_tag_set,
620 req_addr):
621
622 self.go = Signal()
623 self.virt_mode = Signal()
624 self.is_hit = Signal()
625 self.tlb_hit = cfg.TLBHit("tlb_hit")
626 self.hit_way = Signal(cfg.WAY_BITS)
627 self.rel_match = Signal()
628 self.req_index = Signal(cfg.INDEX_BITS)
629 self.reload_tag = Signal(cfg.TAG_BITS)
630
631 self.tlb_way = tlb_way
632 self.cache_i_validdx = cache_i_validdx
633 self.cache_tag_set = cache_tag_set
634 self.req_addr = req_addr
635 self.cfg = cfg
636
637 def elaborate(self, platform):
638 m = Module()
639 comb = m.d.comb
640 sync = m.d.sync
641
642 go = self.go
643 virt_mode = self.virt_mode
644 is_hit = self.is_hit
645 tlb_way = self.tlb_way
646 cache_i_validdx = self.cache_i_validdx
647 cache_tag_set = self.cache_tag_set
648 req_addr = self.req_addr
649 tlb_hit = self.tlb_hit
650 hit_way = self.hit_way
651 rel_match = self.rel_match
652 req_index = self.req_index
653 reload_tag = self.reload_tag
654 cfg = self.cfg
655
656 hit_set = Array(Signal(name="hit_set_%d" % i) \
657 for i in range(cfg.TLB_NUM_WAYS))
658 rel_matches = Array(Signal(name="rel_matches_%d" % i) \
659 for i in range(cfg.TLB_NUM_WAYS))
660 hit_way_set = cfg.HitWaySet()
661
662 # Test if pending request is a hit on any way
663 # In order to make timing in virtual mode,
664 # when we are using the TLB, we compare each
665 # way with each of the real addresses from each way of
666 # the TLB, and then decide later which match to use.
667
668 with m.If(virt_mode):
669 for j in range(cfg.TLB_NUM_WAYS): # tlb_num_way_t
670 s_tag = Signal(cfg.TAG_BITS, name="s_tag%d" % j)
671 s_hit = Signal(name="s_hit%d" % j)
672 s_pte = Signal(cfg.TLB_PTE_BITS, name="s_pte%d" % j)
673 s_ra = Signal(cfg.REAL_ADDR_BITS, name="s_ra%d" % j)
674 # read the PTE, calc the Real Address, get the tag
675 comb += s_pte.eq(cfg.read_tlb_pte(j, tlb_way.pte))
676 comb += s_ra.eq(Cat(req_addr[0:cfg.TLB_LG_PGSZ],
677 s_pte[cfg.TLB_LG_PGSZ:cfg.REAL_ADDR_BITS]))
678 comb += s_tag.eq(cfg.get_tag(s_ra))
679 # for each way check the tag against the cache tag set
680 for i in range(cfg.NUM_WAYS): # way_t
681 is_tag_hit = Signal(name="is_tag_hit_%d_%d" % (j, i))
682 comb += is_tag_hit.eq(go & cache_i_validdx[i] &
683 (cfg.read_tag(i, cache_tag_set) == s_tag)
684 & (tlb_way.valid[j]))
685 with m.If(is_tag_hit):
686 comb += hit_way_set[j].eq(i)
687 comb += s_hit.eq(1)
688 comb += hit_set[j].eq(s_hit)
689 comb += rel_matches[j].eq(s_tag == reload_tag)
690 with m.If(tlb_hit.valid):
691 comb += is_hit.eq(hit_set[tlb_hit.way])
692 comb += hit_way.eq(hit_way_set[tlb_hit.way])
693 comb += rel_match.eq(rel_matches[tlb_hit.way])
694 with m.Else():
695 s_tag = Signal(cfg.TAG_BITS)
696 comb += s_tag.eq(cfg.get_tag(req_addr))
697 for i in range(cfg.NUM_WAYS): # way_t
698 is_tag_hit = Signal(name="is_tag_hit_%d" % i)
699 comb += is_tag_hit.eq(go & cache_i_validdx[i] &
700 (cfg.read_tag(i, cache_tag_set) == s_tag))
701 with m.If(is_tag_hit):
702 comb += hit_way.eq(i)
703 comb += is_hit.eq(1)
704 with m.If(s_tag == reload_tag):
705 comb += rel_match.eq(1)
706
707 return m
708
709
710 class DCache(Elaboratable, DCacheConfig):
711 """Set associative dcache write-through
712
713 TODO (in no specific order):
714 * See list in icache.vhdl
715 * Complete load misses on the cycle when WB data comes instead of
716 at the end of line (this requires dealing with requests coming in
717 while not idle...)
718 """
719 def __init__(self, pspec=None):
720 self.d_in = LoadStore1ToDCacheType("d_in")
721 self.d_out = DCacheToLoadStore1Type("d_out")
722
723 self.m_in = MMUToDCacheType("m_in")
724 self.m_out = DCacheToMMUType("m_out")
725
726 self.stall_out = Signal()
727 self.any_stall_out = Signal()
728 self.dreq_when_stall = Signal()
729 self.mreq_when_stall = Signal()
730
731 # standard naming (wired to non-standard for compatibility)
732 self.bus = Interface(addr_width=32,
733 data_width=64,
734 granularity=8,
735 features={'stall'},
736 #alignment=0,
737 name="dcache")
738
739 self.log_out = Signal(20)
740
741 # test if microwatt compatibility is to be enabled
742 self.microwatt_compat = (hasattr(pspec, "microwatt_compat") and
743 (pspec.microwatt_compat == True))
744
745 if self.microwatt_compat:
746 # reduce way sizes and num lines
747 super().__init__(NUM_LINES = 16,
748 NUM_WAYS = 1,
749 TLB_NUM_WAYS = 1)
750 else:
751 super().__init__()
752
753 def stage_0(self, m, r0, r1, r0_full):
754 """Latch the request in r0.req as long as we're not stalling
755 """
756 comb = m.d.comb
757 sync = m.d.sync
758 d_in, d_out, m_in = self.d_in, self.d_out, self.m_in
759
760 r = RegStage0("stage0")
761
762 # TODO, this goes in unit tests and formal proofs
763 with m.If(d_in.valid & m_in.valid):
764 sync += Display("request collision loadstore vs MMU")
765
766 with m.If(m_in.valid):
767 comb += r.req.valid.eq(1)
768 comb += r.req.load.eq(~(m_in.tlbie | m_in.tlbld)) # no invalidate
769 comb += r.req.dcbz.eq(0)
770 comb += r.req.nc.eq(0)
771 comb += r.req.reserve.eq(0)
772 comb += r.req.virt_mode.eq(0)
773 comb += r.req.priv_mode.eq(1)
774 comb += r.req.addr.eq(m_in.addr)
775 comb += r.req.data.eq(m_in.pte)
776 comb += r.req.byte_sel.eq(~0) # Const -1 sets all to 0b111....
777 comb += r.tlbie.eq(m_in.tlbie)
778 comb += r.doall.eq(m_in.doall)
779 comb += r.tlbld.eq(m_in.tlbld)
780 comb += r.mmu_req.eq(1)
781 comb += r.d_valid.eq(1)
782 m.d.sync += Display(" DCACHE req mmu addr %x pte %x ld %d",
783 m_in.addr, m_in.pte, r.req.load)
784
785 with m.Else():
786 comb += r.req.eq(d_in)
787 comb += r.req.data.eq(0)
788 comb += r.tlbie.eq(0)
789 comb += r.doall.eq(0)
790 comb += r.tlbld.eq(0)
791 comb += r.mmu_req.eq(0)
792 comb += r.d_valid.eq(0)
793
794 sync += r0_full.eq(0)
795 with m.If((~r1.full & ~d_in.hold) | ~r0_full):
796 sync += r0.eq(r)
797 sync += r0_full.eq(r.req.valid)
798 with m.Elif(~r0.d_valid):
799 # Sample data the cycle after a request comes in from loadstore1.
800 # If another request has come in already then the data will get
801 # put directly into req.data below.
802 sync += r0.req.data.eq(d_in.data)
803 sync += r0.d_valid.eq(1)
804 with m.If(d_in.valid):
805 m.d.sync += Display(" DCACHE req cache "
806 "virt %d addr %x data %x ld %d",
807 r.req.virt_mode, r.req.addr,
808 r.req.data, r.req.load)
809
810 def tlb_read(self, m, r0_stall, tlb_way):
811 """TLB
812 Operates in the second cycle on the request latched in r0.req.
813 TLB updates write the entry at the end of the second cycle.
814 """
815 comb = m.d.comb
816 sync = m.d.sync
817 m_in, d_in = self.m_in, self.d_in
818
819 addrbits = Signal(self.TLB_SET_BITS)
820
821 amin = self.TLB_LG_PGSZ
822 amax = self.TLB_LG_PGSZ + self.TLB_SET_BITS
823
824 with m.If(m_in.valid):
825 comb += addrbits.eq(m_in.addr[amin : amax])
826 with m.Else():
827 comb += addrbits.eq(d_in.addr[amin : amax])
828
829 # If we have any op and the previous op isn't finished,
830 # then keep the same output for next cycle.
831 d = self.dtlb_update
832 comb += d.tlb_read_index.eq(addrbits)
833 comb += d.tlb_read.eq(~r0_stall)
834 comb += tlb_way.eq(d.tlb_way)
835
836 def maybe_tlb_plrus(self, m, r1, tlb_plru_victim, tlb_req_index):
837 """Generate TLB PLRUs
838 """
839 comb = m.d.comb
840 sync = m.d.sync
841
842 if self.TLB_NUM_WAYS == 0:
843 return
844
845 # suite of PLRUs with a selection and output mechanism
846 tlb_plrus = PLRUs(self.TLB_SET_SIZE, self.TLB_WAY_BITS)
847 m.submodules.tlb_plrus = tlb_plrus
848 comb += tlb_plrus.way.eq(r1.tlb_hit.way)
849 comb += tlb_plrus.valid.eq(r1.tlb_hit.valid)
850 comb += tlb_plrus.index.eq(r1.tlb_hit_index)
851 comb += tlb_plrus.isel.eq(tlb_req_index) # select victim
852 comb += tlb_plru_victim.eq(tlb_plrus.o_index) # selected victim
853
854 def tlb_search(self, m, tlb_req_index, r0, r0_valid,
855 tlb_way,
856 pte, tlb_hit, valid_ra, perm_attr, ra):
857
858 comb = m.d.comb
859
860 hitway = Signal(self.TLB_WAY_BITS)
861 hit = Signal()
862 eatag = Signal(self.TLB_EA_TAG_BITS)
863
864 self.TLB_LG_END = self.TLB_LG_PGSZ + self.TLB_SET_BITS
865 r0_req_addr = r0.req.addr[self.TLB_LG_PGSZ : self.TLB_LG_END]
866 comb += tlb_req_index.eq(r0_req_addr)
867 comb += eatag.eq(r0.req.addr[self.TLB_LG_END : 64 ])
868
869 for i in range(self.TLB_NUM_WAYS):
870 is_tag_hit = Signal(name="is_tag_hit%d" % i)
871 tlb_tag = Signal(self.TLB_EA_TAG_BITS, name="tlb_tag%d" % i)
872 comb += tlb_tag.eq(self.read_tlb_tag(i, tlb_way.tag))
873 comb += is_tag_hit.eq((tlb_way.valid[i]) & (tlb_tag == eatag))
874 with m.If(is_tag_hit):
875 comb += hitway.eq(i)
876 comb += hit.eq(1)
877
878 comb += tlb_hit.valid.eq(hit & r0_valid)
879 comb += tlb_hit.way.eq(hitway)
880
881 with m.If(tlb_hit.valid):
882 comb += pte.eq(self.read_tlb_pte(hitway, tlb_way.pte))
883 comb += valid_ra.eq(tlb_hit.valid | ~r0.req.virt_mode)
884
885 with m.If(r0.req.virt_mode):
886 comb += ra.eq(Cat(Const(0, self.ROW_OFF_BITS),
887 r0.req.addr[self.ROW_OFF_BITS:self.TLB_LG_PGSZ],
888 pte[self.TLB_LG_PGSZ:self.REAL_ADDR_BITS]))
889 comb += perm_attr.reference.eq(pte[8])
890 comb += perm_attr.changed.eq(pte[7])
891 comb += perm_attr.nocache.eq(pte[5])
892 comb += perm_attr.priv.eq(pte[3])
893 comb += perm_attr.rd_perm.eq(pte[2])
894 comb += perm_attr.wr_perm.eq(pte[1])
895 with m.Else():
896 comb += ra.eq(Cat(Const(0, self.ROW_OFF_BITS),
897 r0.req.addr[self.ROW_OFF_BITS:self.REAL_ADDR_BITS]))
898 comb += perm_attr.reference.eq(1)
899 comb += perm_attr.changed.eq(1)
900 comb += perm_attr.nocache.eq(0)
901 comb += perm_attr.priv.eq(1)
902 comb += perm_attr.rd_perm.eq(1)
903 comb += perm_attr.wr_perm.eq(1)
904
905 with m.If(valid_ra):
906 m.d.sync += Display("DCACHE virt mode %d hit %d ra %x pte %x",
907 r0.req.virt_mode, tlb_hit.valid, ra, pte)
908 m.d.sync += Display(" perm ref=%d", perm_attr.reference)
909 m.d.sync += Display(" perm chg=%d", perm_attr.changed)
910 m.d.sync += Display(" perm noc=%d", perm_attr.nocache)
911 m.d.sync += Display(" perm prv=%d", perm_attr.priv)
912 m.d.sync += Display(" perm rdp=%d", perm_attr.rd_perm)
913 m.d.sync += Display(" perm wrp=%d", perm_attr.wr_perm)
914
915 def tlb_update(self, m, r0_valid, r0, tlb_req_index,
916 tlb_hit, tlb_plru_victim):
917
918 comb = m.d.comb
919 sync = m.d.sync
920
921 tlbie = Signal()
922 tlbwe = Signal()
923
924 comb += tlbie.eq(r0_valid & r0.tlbie)
925 comb += tlbwe.eq(r0_valid & r0.tlbld)
926
927 d = self.dtlb_update
928
929 comb += d.tlbie.eq(tlbie)
930 comb += d.tlbwe.eq(tlbwe)
931 comb += d.doall.eq(r0.doall)
932 comb += d.tlb_hit.eq(tlb_hit)
933 comb += d.tlb_req_index.eq(tlb_req_index)
934
935 with m.If(tlb_hit.valid):
936 comb += d.repl_way.eq(tlb_hit.way)
937 with m.Else():
938 comb += d.repl_way.eq(tlb_plru_victim)
939 comb += d.eatag.eq(r0.req.addr[self.TLB_LG_PGSZ + self.TLB_SET_BITS:64])
940 comb += d.pte_data.eq(r0.req.data)
941
942 def maybe_plrus(self, m, r1, plru_victim):
943 """Generate PLRUs
944 """
945 comb = m.d.comb
946 sync = m.d.sync
947
948 if self.TLB_NUM_WAYS == 0:
949 return
950
951 # suite of PLRUs with a selection and output mechanism
952 m.submodules.plrus = plrus = PLRUs(self.NUM_LINES, self.WAY_BITS)
953 comb += plrus.way.eq(r1.hit_way)
954 comb += plrus.valid.eq(r1.cache_hit)
955 comb += plrus.index.eq(r1.hit_index)
956 comb += plrus.isel.eq(r1.store_index) # select victim
957 comb += plru_victim.eq(plrus.o_index) # selected victim
958
959 def cache_tag_read(self, m, r0_stall, req_index, cache_tag_set):
960 """Cache tag RAM read port
961 """
962 comb = m.d.comb
963 sync = m.d.sync
964
965 m_in, d_in = self.m_in, self.d_in
966
967 # synchronous tag read-port
968 m.submodules.rd_tag = rd_tag = self.tagmem.read_port()
969
970 index = Signal(self.INDEX_BITS)
971
972 with m.If(r0_stall):
973 comb += index.eq(req_index)
974 with m.Elif(m_in.valid):
975 comb += index.eq(self.get_index(m_in.addr))
976 with m.Else():
977 comb += index.eq(self.get_index(d_in.addr))
978 comb += rd_tag.addr.eq(index)
979 comb += cache_tag_set.eq(rd_tag.data) # read-port is a 1-clock delay
980
981 def dcache_request(self, m, r0, ra, req_index, req_row, req_tag,
982 r0_valid, r1, cache_valids, replace_way,
983 use_forward1_next, use_forward2_next,
984 req_hit_way, plru_victim, rc_ok, perm_attr,
985 valid_ra, perm_ok, access_ok, req_op, req_go,
986 tlb_hit, tlb_way, cache_tag_set,
987 cancel_store, req_same_tag, r0_stall, early_req_row):
988 """Cache request parsing and hit detection
989 """
990
991 comb = m.d.comb
992 m_in, d_in = self.m_in, self.d_in
993
994 is_hit = Signal()
995 hit_way = Signal(self.WAY_BITS)
996 op = Signal(Op)
997 opsel = Signal(3)
998 go = Signal()
999 nc = Signal()
1000 cache_i_validdx = Signal(self.NUM_WAYS)
1001
1002 # Extract line, row and tag from request
1003 comb += req_index.eq(self.get_index(r0.req.addr))
1004 comb += req_row.eq(self.get_row(r0.req.addr))
1005 comb += req_tag.eq(self.get_tag(ra))
1006
1007 if False: # display on comb is a bit... busy.
1008 comb += Display("dcache_req addr:%x ra: %x idx: %x tag: %x row: %x",
1009 r0.req.addr, ra, req_index, req_tag, req_row)
1010
1011 comb += go.eq(r0_valid & ~(r0.tlbie | r0.tlbld) & ~r1.ls_error)
1012 comb += cache_i_validdx.eq(cache_valids[req_index])
1013
1014 m.submodules.dcache_pend = dc = DCachePendingHit(self, tlb_way,
1015 cache_i_validdx, cache_tag_set,
1016 r0.req.addr)
1017 comb += dc.tlb_hit.eq(tlb_hit)
1018 comb += dc.reload_tag.eq(r1.reload_tag)
1019 comb += dc.virt_mode.eq(r0.req.virt_mode)
1020 comb += dc.go.eq(go)
1021 comb += dc.req_index.eq(req_index)
1022
1023 comb += is_hit.eq(dc.is_hit)
1024 comb += hit_way.eq(dc.hit_way)
1025 comb += req_same_tag.eq(dc.rel_match)
1026
1027 # See if the request matches the line currently being reloaded
1028 with m.If((r1.state == State.RELOAD_WAIT_ACK) &
1029 (req_index == r1.store_index) & req_same_tag):
1030 # For a store, consider this a hit even if the row isn't
1031 # valid since it will be by the time we perform the store.
1032 # For a load, check the appropriate row valid bit.
1033 rrow = Signal(self.ROW_LINE_BITS)
1034 comb += rrow.eq(req_row)
1035 valid = r1.rows_valid[rrow]
1036 comb += is_hit.eq((~r0.req.load) | valid)
1037 comb += hit_way.eq(replace_way)
1038
1039 # Whether to use forwarded data for a load or not
1040 with m.If((self.get_row(r1.req.real_addr) == req_row) &
1041 (r1.req.hit_way == hit_way)):
1042 # Only need to consider r1.write_bram here, since if we
1043 # are writing refill data here, then we don't have a
1044 # cache hit this cycle on the line being refilled.
1045 # (There is the possibility that the load following the
1046 # load miss that started the refill could be to the old
1047 # contents of the victim line, since it is a couple of
1048 # cycles after the refill starts before we see the updated
1049 # cache tag. In that case we don't use the bypass.)
1050 comb += use_forward1_next.eq(r1.write_bram)
1051 with m.If((r1.forward_row1 == req_row) & (r1.forward_way1 == hit_way)):
1052 comb += use_forward2_next.eq(r1.forward_valid1)
1053
1054 # The way that matched on a hit
1055 comb += req_hit_way.eq(hit_way)
1056
1057 # The way to replace on a miss
1058 with m.If(r1.write_tag):
1059 comb += replace_way.eq(plru_victim)
1060 with m.Else():
1061 comb += replace_way.eq(r1.store_way)
1062
1063 # work out whether we have permission for this access
1064 # NB we don't yet implement AMR, thus no KUAP
1065 comb += rc_ok.eq(perm_attr.reference
1066 & (r0.req.load | perm_attr.changed))
1067 comb += perm_ok.eq((r0.req.priv_mode | (~perm_attr.priv)) &
1068 (perm_attr.wr_perm |
1069 (r0.req.load & perm_attr.rd_perm)))
1070 comb += access_ok.eq(valid_ra & perm_ok & rc_ok)
1071
1072 # Combine the request and cache hit status to decide what
1073 # operation needs to be done
1074 comb += nc.eq(r0.req.nc | perm_attr.nocache)
1075 comb += op.eq(Op.OP_NONE)
1076 with m.If(go):
1077 with m.If(~access_ok):
1078 m.d.sync += Display("DCACHE access fail valid_ra=%d p=%d rc=%d",
1079 valid_ra, perm_ok, rc_ok)
1080 comb += op.eq(Op.OP_BAD)
1081 with m.Elif(cancel_store):
1082 m.d.sync += Display("DCACHE cancel store")
1083 comb += op.eq(Op.OP_STCX_FAIL)
1084 with m.Else():
1085 m.d.sync += Display("DCACHE valid_ra=%d nc=%d ld=%d",
1086 valid_ra, nc, r0.req.load)
1087 comb += opsel.eq(Cat(is_hit, nc, r0.req.load))
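# opsel bit order (LSB first): [0]=is_hit, [1]=nc, [2]=load,
# so e.g. 0b101 below is a cacheable load that hit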
1088 with m.Switch(opsel):
1089 with m.Case(0b101): comb += op.eq(Op.OP_LOAD_HIT)
1090 with m.Case(0b100): comb += op.eq(Op.OP_LOAD_MISS)
1091 with m.Case(0b110): comb += op.eq(Op.OP_LOAD_NC)
1092 with m.Case(0b001): comb += op.eq(Op.OP_STORE_HIT)
1093 with m.Case(0b000): comb += op.eq(Op.OP_STORE_MISS)
1094 with m.Case(0b010): comb += op.eq(Op.OP_STORE_MISS)
1095 with m.Case(0b011): comb += op.eq(Op.OP_BAD)
1096 with m.Case(0b111): comb += op.eq(Op.OP_BAD)
1097 comb += req_op.eq(op)
1098 comb += req_go.eq(go)
1099
1100 # Version of the row number that is valid one cycle earlier
1101 # in the cases where we need to read the cache data BRAM.
1102 # If we're stalling then we need to keep reading the last
1103 # row requested.
1104 with m.If(~r0_stall):
1105 with m.If(m_in.valid):
1106 comb += early_req_row.eq(self.get_row(m_in.addr))
1107 with m.Else():
1108 comb += early_req_row.eq(self.get_row(d_in.addr))
1109 with m.Else():
1110 comb += early_req_row.eq(req_row)
1111
1112 def reservation_comb(self, m, cancel_store, set_rsrv, clear_rsrv,
1113 r0_valid, r0, reservation):
1114 """Handle load-with-reservation and store-conditional instructions
1115 """
1116 comb = m.d.comb
1117
1118 with m.If(r0_valid & r0.req.reserve):
1119 # XXX generate alignment interrupt if address
1120 # is not aligned XXX or if r0.req.nc = '1'
1121 with m.If(r0.req.load):
1122 comb += set_rsrv.eq(r0.req.atomic_last) # load with reservation
1123 with m.Else():
1124 comb += clear_rsrv.eq(r0.req.atomic_last) # store conditional
1125 with m.If((~reservation.valid) |
1126 (r0.req.addr[self.LINE_OFF_BITS:64] !=
1127 reservation.addr)):
1128 comb += cancel_store.eq(1)
1129
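# note: reservations are tracked at cache-line granularity
# (addr[LINE_OFF_BITS:64]); a stcx. outside the reserved line is
# cancelled here and later reported via OP_STCX_FAIL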
1130 def reservation_reg(self, m, r0_valid, access_ok, set_rsrv, clear_rsrv,
1131 reservation, r0):
1132 comb = m.d.comb
1133 sync = m.d.sync
1134
1135 with m.If(r0_valid & access_ok):
1136 with m.If(clear_rsrv):
1137 sync += reservation.valid.eq(0)
1138 with m.Elif(set_rsrv):
1139 sync += reservation.valid.eq(1)
1140 sync += reservation.addr.eq(r0.req.addr[self.LINE_OFF_BITS:64])
1141
1142 def writeback_control(self, m, r1, cache_out_row):
1143 """Return data for loads & completion control logic
1144 """
1145 comb = m.d.comb
1146 sync = m.d.sync
1147 d_out, m_out = self.d_out, self.m_out
1148
1149 data_out = Signal(64)
1150 data_fwd = Signal(64)
1151
1152 # Use the bypass if we are reading the row that was
1153 # written 1 or 2 cycles ago, including for the
1154 # slow_valid = 1 case (i.e. completing a load
1155 # miss or a non-cacheable load).
1156 with m.If(r1.use_forward1):
1157 comb += data_fwd.eq(r1.forward_data1)
1158 with m.Else():
1159 comb += data_fwd.eq(r1.forward_data2)
1160
1161 comb += data_out.eq(cache_out_row)
1162
1163 for i in range(8):
1164 with m.If(r1.forward_sel[i]):
1165 dsel = data_fwd.word_select(i, 8)
1166 comb += data_out.word_select(i, 8).eq(dsel)
1167
1168 # DCache output to LoadStore
1169 comb += d_out.valid.eq(r1.ls_valid)
1170 comb += d_out.data.eq(data_out)
1171 comb += d_out.store_done.eq(~r1.stcx_fail)
1172 comb += d_out.error.eq(r1.ls_error)
1173 comb += d_out.cache_paradox.eq(r1.cache_paradox)
1174
1175 # Outputs to MMU
1176 comb += m_out.done.eq(r1.mmu_done)
1177 comb += m_out.err.eq(r1.mmu_error)
1178 comb += m_out.data.eq(data_out)
1179
1180 # We have a valid load or store hit or we just completed
1181 # a slow op such as a load miss, a NC load or a store
1182 #
1183 # Note: the load hit is delayed by one cycle. However it
1184 # can still not collide with r.slow_valid (well unless I
1185 # miscalculated) because slow_valid can only be set on a
1186 # subsequent request and not on its first cycle (the state
1187 # machine must have advanced), which makes slow_valid
1188 # at least 2 cycles from the previous hit_load_valid.
1189
1190 # Sanity: Only one of these must be set in any given cycle
1191 # Sanity: at most one of these may be set in any given cycle
1192 if False: # TODO: need Display to get this to work
1193 assert (r1.slow_valid & r1.stcx_fail) != 1, \
1194 "unexpected slow_valid collision with stcx_fail"
1195
1196 assert ((r1.slow_valid | r1.stcx_fail) | r1.hit_load_valid) != 1, \
1197 "unexpected hit_load_delayed collision with slow_valid"
1198
1199 with m.If(~r1.mmu_req):
1200 # Request came from loadstore1...
1201 # Load hit case is the standard path
1202 with m.If(r1.hit_load_valid):
1203 sync += Display("completing load hit data=%x", data_out)
1204
1205 # error cases complete without stalling
1206 with m.If(r1.ls_error):
1207 with m.If(r1.dcbz):
1208 sync += Display("completing dcbz with error")
1209 with m.Else():
1210 sync += Display("completing ld/st with error")
1211
1212 # Slow ops (load miss, NC, stores)
1213 with m.If(r1.slow_valid):
1214 sync += Display("completing store or load miss adr=%x data=%x",
1215 r1.req.real_addr, data_out)
1216
1217 with m.Else():
1218 # Request came from MMU
1219 with m.If(r1.hit_load_valid):
1220 sync += Display("completing load hit to MMU, data=%x",
1221 m_out.data)
1222 # error cases complete without stalling
1223 with m.If(r1.mmu_error):
1224 sync += Display("completing MMU ld with error")
1225
1226 # Slow ops (i.e. load miss)
1227 with m.If(r1.slow_valid):
1228 sync += Display("completing MMU load miss, adr=%x data=%x",
1229 r1.req.real_addr, m_out.data)
1230
1231 def rams(self, m, r1, early_req_row, cache_out_row, replace_way):
1232 """rams
1233 Generate a cache RAM for each way. This handles the normal
1234 reads, writes from reloads and the special store-hit update
1235 path as well.
1236
1237 Note: the BRAMs have an extra read buffer, meaning the output
1238 is pipelined an extra cycle. This differs from the
1239 icache. The writeback logic needs to take that into
1240 account by using 1-cycle delayed signals for load hits.
1241 """
1242 comb = m.d.comb
1243 bus = self.bus
1244
1245 # Binary-to-Unary one-hot decoders here. the replace-way one-hot is
1246 # gated (enabled) by bus.ack, not-write-bram, and state RELOAD_WAIT_ACK
1247 m.submodules.rams_replace_way_e = rwe = Decoder(self.NUM_WAYS)
1248 comb += rwe.n.eq(~((r1.state == State.RELOAD_WAIT_ACK) & bus.ack &
1249 ~r1.write_bram))
1250 comb += rwe.i.eq(replace_way)
1251
1252 m.submodules.rams_hit_way_e = hwe = Decoder(self.NUM_WAYS)
1253 comb += hwe.i.eq(r1.hit_way)
1254
1255 # this one is gated with write_bram, and replace_way_e can never be
1256 # set at the same time. that means that do_write can OR the outputs
1257 m.submodules.rams_hit_req_way_e = hre = Decoder(self.NUM_WAYS)
1258 comb += hre.n.eq(~r1.write_bram) # Decoder.n is inverted
1259 comb += hre.i.eq(r1.req.hit_way)
1260
1261 # common Signals
1262 do_read = Signal()
1263 wr_addr = Signal(self.ROW_BITS)
1264 wr_data = Signal(WB_DATA_BITS)
1265 wr_sel = Signal(self.ROW_SIZE)
1266 rd_addr = Signal(self.ROW_BITS)
1267
1268 comb += do_read.eq(1) # always enable
1269 comb += rd_addr.eq(early_req_row)
1270
1271 # Write mux:
1272 #
1273 # Defaults to wishbone read responses (cache refill)
1274 #
1275 # For timing, the mux on wr_data/sel/addr is not
1276 # dependent on anything other than the current state.
1277
1278 with m.If(r1.write_bram):
1279 # Write store data to BRAM. This happens one
1280 # cycle after the store is in r0.
1281 comb += wr_data.eq(r1.req.data)
1282 comb += wr_sel.eq(r1.req.byte_sel)
1283 comb += wr_addr.eq(self.get_row(r1.req.real_addr))
1284
1285 with m.Else():
1286 # Otherwise, we might be doing a reload or a DCBZ
1287 with m.If(r1.dcbz):
1288 comb += wr_data.eq(0)
1289 with m.Else():
1290 comb += wr_data.eq(bus.dat_r)
1291 comb += wr_addr.eq(r1.store_row)
1292 comb += wr_sel.eq(~0) # all 1s
1293
1294 # set up Cache Rams
1295 for i in range(self.NUM_WAYS):
1296 do_write = Signal(name="do_wr%d" % i)
1297 wr_sel_m = Signal(self.ROW_SIZE, name="wr_sel_m_%d" % i)
1298 d_out= Signal(WB_DATA_BITS, name="dout_%d" % i) # cache_row_t
1299
1300 way = CacheRam(self.ROW_BITS, WB_DATA_BITS, ADD_BUF=True, ram_num=i)
1301 m.submodules["cacheram_%d" % i] = way
1302
1303 comb += way.rd_en.eq(do_read)
1304 comb += way.rd_addr.eq(rd_addr)
1305 comb += d_out.eq(way.rd_data_o)
1306 comb += way.wr_sel.eq(wr_sel_m)
1307 comb += way.wr_addr.eq(wr_addr)
1308 comb += way.wr_data.eq(wr_data)
1309
1310 # Cache hit reads
1311 with m.If(hwe.o[i]):
1312 comb += cache_out_row.eq(d_out)
1313
1314 # these are mutually-exclusive via their Decoder-enablers
1315 # (note: Decoder-enable is inverted)
1316 comb += do_write.eq(hre.o[i] | rwe.o[i])
1317
1318 # Mask write selects with do_write since BRAM
1319 # doesn't have a global write-enable
1320 with m.If(do_write):
1321 comb += wr_sel_m.eq(wr_sel)
1322
1323 # Cache hit synchronous machine for the easy case.
1324 # This handles load hits.
1325 # It also handles error cases (TLB miss, cache paradox)
1326 def dcache_fast_hit(self, m, req_op, r0_valid, r0, r1,
1327 req_hit_way, req_index, req_tag, access_ok,
1328 tlb_hit, tlb_req_index):
1329 comb = m.d.comb
1330 sync = m.d.sync
1331
1332 with m.If(req_op != Op.OP_NONE):
1333 sync += Display("op:%d addr:%x nc: %d idx: %x tag: %x way: %x",
1334 req_op, r0.req.addr, r0.req.nc,
1335 req_index, req_tag, req_hit_way)
1336
1337 with m.If(r0_valid):
1338 sync += r1.mmu_req.eq(r0.mmu_req)
1339
1340 # Fast path for load/store hits.
1341 # Set signals for the writeback controls.
1342 sync += r1.hit_way.eq(req_hit_way)
1343 sync += r1.hit_index.eq(req_index)
1344
1345 sync += r1.hit_load_valid.eq(req_op == Op.OP_LOAD_HIT)
1346 sync += r1.cache_hit.eq((req_op == Op.OP_LOAD_HIT) |
1347 (req_op == Op.OP_STORE_HIT))
1348
1349 with m.If(req_op == Op.OP_BAD):
1350 sync += Display("Signalling ld/st error "
1351 "ls_error=%i mmu_error=%i cache_paradox=%i",
1352 ~r0.mmu_req, r0.mmu_req, access_ok)
1353 sync += r1.ls_error.eq(~r0.mmu_req)
1354 sync += r1.mmu_error.eq(r0.mmu_req)
1355 sync += r1.cache_paradox.eq(access_ok)
1356 with m.Else():
1357 sync += r1.ls_error.eq(0)
1358 sync += r1.mmu_error.eq(0)
1359 sync += r1.cache_paradox.eq(0)
1360
1361 sync += r1.stcx_fail.eq(req_op == Op.OP_STCX_FAIL)
1362
1363 # Record TLB hit information for updating TLB PLRU
1364 sync += r1.tlb_hit.eq(tlb_hit)
1365 sync += r1.tlb_hit_index.eq(tlb_req_index)
1366
1367 # Memory accesses are handled by this state machine:
1368 #
1369 # * Cache load miss/reload (in conjunction with "rams")
1370 # * Load hits for non-cachable forms
1371 # * Stores (the collision case is handled in "rams")
1372 #
1373 # All wishbone requests generation is done here.
1374 # This machine operates at stage 1.
1375 def dcache_slow(self, m, r1, use_forward1_next, use_forward2_next,
1376 r0, replace_way,
1377 req_hit_way, req_same_tag,
1378 r0_valid, req_op, cache_valids, req_go, ra):
1379
1380 comb = m.d.comb
1381 sync = m.d.sync
1382 bus = self.bus
1383 d_in = self.d_in
1384
1385 m.submodules.wr_tag = wr_tag = self.tagmem.write_port(
1386 granularity=self.TAG_WIDTH)
1387
1388 req = MemAccessRequest(self, "mreq_ds")
1389
1390 r1_next_cycle = Signal()
1391 req_row = Signal(self.ROW_BITS)
1392 req_idx = Signal(self.INDEX_BITS)
1393 req_tag = Signal(self.TAG_BITS)
1394 comb += req_idx.eq(self.get_index(req.real_addr))
1395 comb += req_row.eq(self.get_row(req.real_addr))
1396 comb += req_tag.eq(self.get_tag(req.real_addr))
1397
1398 sync += r1.use_forward1.eq(use_forward1_next)
1399 sync += r1.forward_sel.eq(0)
1400
1401 with m.If(use_forward1_next):
1402 sync += r1.forward_sel.eq(r1.req.byte_sel)
1403 with m.Elif(use_forward2_next):
1404 sync += r1.forward_sel.eq(r1.forward_sel1)
1405
1406 sync += r1.forward_data2.eq(r1.forward_data1)
1407 with m.If(r1.write_bram):
1408 sync += r1.forward_data1.eq(r1.req.data)
1409 sync += r1.forward_sel1.eq(r1.req.byte_sel)
1410 sync += r1.forward_way1.eq(r1.req.hit_way)
1411 sync += r1.forward_row1.eq(self.get_row(r1.req.real_addr))
1412 sync += r1.forward_valid1.eq(1)
1413 with m.Else():
1414 with m.If(r1.dcbz):
1415 sync += r1.forward_data1.eq(0)
1416 with m.Else():
1417 sync += r1.forward_data1.eq(bus.dat_r)
1418 sync += r1.forward_sel1.eq(~0) # all 1s
1419 sync += r1.forward_way1.eq(replace_way)
1420 sync += r1.forward_row1.eq(r1.store_row)
1421 sync += r1.forward_valid1.eq(0)
1422
1423 # One cycle pulses reset
1424 sync += r1.slow_valid.eq(0)
1425 sync += r1.write_bram.eq(0)
1426 sync += r1.inc_acks.eq(0)
1427 sync += r1.dec_acks.eq(0)
1428
1429 sync += r1.ls_valid.eq(0)
1430 # complete tlbies and TLB loads in the third cycle
1431 sync += r1.mmu_done.eq(r0_valid & (r0.tlbie | r0.tlbld))
1432
1433 with m.If((req_op == Op.OP_LOAD_HIT) | (req_op == Op.OP_STCX_FAIL)):
1434 with m.If(r0.mmu_req):
1435 sync += r1.mmu_done.eq(1)
1436 with m.Else():
1437 sync += r1.ls_valid.eq(1)
1438
1439 with m.If(r1.write_tag):
1440 # Store new tag in selected way
1441 replace_way_onehot = Signal(self.NUM_WAYS)
1442 comb += replace_way_onehot.eq(1<<replace_way)
1443 ct = Signal(self.TAG_RAM_WIDTH)
1444 comb += ct.eq(r1.reload_tag << (replace_way*self.TAG_WIDTH))
1445 comb += wr_tag.en.eq(replace_way_onehot)
1446 comb += wr_tag.addr.eq(r1.store_index)
1447 comb += wr_tag.data.eq(ct)
1448
1449 sync += r1.store_way.eq(replace_way)
1450 sync += r1.write_tag.eq(0)
1451
1452 # Take request from r1.req if there is one there,
1453 # else from req_op, ra, etc.
1454 with m.If(r1.full):
1455 comb += req.eq(r1.req)
1456 with m.Else():
1457 comb += req.op.eq(req_op)
1458 comb += req.valid.eq(req_go)
1459 comb += req.mmu_req.eq(r0.mmu_req)
1460 comb += req.dcbz.eq(r0.req.dcbz)
1461 comb += req.real_addr.eq(ra)
1462
1463 with m.If(r0.req.dcbz):
1464 # force data to 0 for dcbz
1465 comb += req.data.eq(0)
1466 with m.Elif(r0.d_valid):
1467 comb += req.data.eq(r0.req.data)
1468 with m.Else():
1469 comb += req.data.eq(d_in.data)
1470
1471 # Select all bytes for dcbz
1472 # and for cacheable loads
1473 with m.If(r0.req.dcbz | (r0.req.load & ~r0.req.nc)):
1474 comb += req.byte_sel.eq(~0) # all 1s
1475 with m.Else():
1476 comb += req.byte_sel.eq(r0.req.byte_sel)
1477 comb += req.hit_way.eq(req_hit_way)
1478 comb += req.same_tag.eq(req_same_tag)
1479
1480 # Store the incoming request from r0,
1481 # if it is a slow request
1482 # Note that r1.full = 1 implies req_op = OP_NONE
1483 with m.If((req_op == Op.OP_LOAD_MISS)
1484 | (req_op == Op.OP_LOAD_NC)
1485 | (req_op == Op.OP_STORE_MISS)
1486 | (req_op == Op.OP_STORE_HIT)):
1487 sync += r1.req.eq(req)
1488 sync += r1.full.eq(1)
1489 # do not let r1.state RELOAD_WAIT_ACK or STORE_WAIT_ACK
1490 # destroy r1.req by overwriting r1.full back to zero
1491 comb += r1_next_cycle.eq(1)
1492
1493 # Main state machine
1494 with m.Switch(r1.state):
1495
1496 with m.Case(State.IDLE):
1497 sync += r1.wb.adr.eq(req.real_addr[self.ROW_OFF_BITS:])
1498 sync += r1.wb.sel.eq(req.byte_sel)
1499 sync += r1.wb.dat.eq(req.data)
1500 sync += r1.dcbz.eq(req.dcbz)
1501
1502 # Keep track of our index and way
1503 # for subsequent stores.
1504 sync += r1.store_index.eq(req_idx)
1505 sync += r1.store_row.eq(req_row)
1506 sync += r1.end_row_ix.eq(self.get_row_of_line(req_row)-1)
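# the reload starts at the requested (critical) row and wraps
# around the line, so the final row is the one just before it,
# modulo ROW_PER_LINE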
1507 sync += r1.reload_tag.eq(req_tag)
1508 sync += r1.req.same_tag.eq(1)
1509
1510 with m.If(req.op == Op.OP_STORE_HIT):
1511 sync += r1.store_way.eq(req.hit_way)
1512
1513 #with m.If(r1.dec_acks):
1514 # sync += r1.acks_pending.eq(r1.acks_pending - 1)
1515
1516 # Reset per-row valid bits,
1517 # ready for handling OP_LOAD_MISS
1518 for i in range(self.ROW_PER_LINE):
1519 sync += r1.rows_valid[i].eq(0)
1520
1521 with m.If(req_op != Op.OP_NONE):
1522 sync += Display("cache op %d", req.op)
1523
1524 with m.Switch(req.op):
1525 with m.Case(Op.OP_LOAD_HIT):
1526 # stay in IDLE state
1527 pass
1528
1529 with m.Case(Op.OP_LOAD_MISS):
1530 sync += Display("cache miss real addr: %x " \
1531 "idx: %x tag: %x",
1532 req.real_addr, req_row, req_tag)
1533
1534 # Start the wishbone cycle
1535 sync += r1.wb.we.eq(0)
1536 sync += r1.wb.cyc.eq(1)
1537 sync += r1.wb.stb.eq(1)
1538
1539 # Track that we had one request sent
1540 sync += r1.state.eq(State.RELOAD_WAIT_ACK)
1541 sync += r1.write_tag.eq(1)
1542
1543 with m.Case(Op.OP_LOAD_NC):
1544 sync += r1.wb.cyc.eq(1)
1545 sync += r1.wb.stb.eq(1)
1546 sync += r1.wb.we.eq(0)
1547 sync += r1.state.eq(State.NC_LOAD_WAIT_ACK)
1548
1549 with m.Case(Op.OP_STORE_HIT, Op.OP_STORE_MISS):
1550 with m.If(~req.dcbz):
1551 sync += r1.state.eq(State.STORE_WAIT_ACK)
1552 sync += r1.acks_pending.eq(1)
1553 sync += r1.full.eq(0)
1554 comb += r1_next_cycle.eq(0)
1555 sync += r1.slow_valid.eq(1)
1556
1557 with m.If(req.mmu_req):
1558 sync += r1.mmu_done.eq(1)
1559 with m.Else():
1560 sync += r1.ls_valid.eq(1)
1561
1562 with m.If(req.op == Op.OP_STORE_HIT):
1563 sync += r1.write_bram.eq(1)
1564 with m.Else():
1565 # dcbz is handled much like a load miss except
1566 # that we are writing to memory instead of reading
1567 sync += r1.state.eq(State.RELOAD_WAIT_ACK)
1568
1569 with m.If(req.op == Op.OP_STORE_MISS):
1570 sync += r1.write_tag.eq(1)
1571
1572 sync += r1.wb.we.eq(1)
1573 sync += r1.wb.cyc.eq(1)
1574 sync += r1.wb.stb.eq(1)
1575
1576 # OP_NONE and OP_BAD do nothing
1577 # OP_BAD & OP_STCX_FAIL were
1578 # handled above already
1579 with m.Case(Op.OP_NONE):
1580 pass
1581 with m.Case(Op.OP_BAD):
1582 pass
1583 with m.Case(Op.OP_STCX_FAIL):
1584 pass
1585
1586 with m.Case(State.RELOAD_WAIT_ACK):
1587 ld_stbs_done = Signal()
1588 # Requests are all sent if stb is 0
1589 comb += ld_stbs_done.eq(~r1.wb.stb)
1590
1591 # If we are still sending requests, was one accepted?
1592 with m.If((~bus.stall) & r1.wb.stb):
1593 # That was the last word? We are done sending.
1594 # Clear stb and set ld_stbs_done so we can handle an
1595 # eventual last ack on the same cycle.
1596 # sigh - reconstruct wb adr with 3 extra 0s at front
1597 wb_adr = Cat(Const(0, self.ROW_OFF_BITS), r1.wb.adr)
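# (with the default ROW_OFF_BITS=3 this appends three zero LSBs,
# turning the row address back into a byte address)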
1598 with m.If(self.is_last_row_addr(wb_adr, r1.end_row_ix)):
1599 sync += r1.wb.stb.eq(0)
1600 comb += ld_stbs_done.eq(1)
1601
1602 # Calculate the next row address in the current cache line
1603 rlen = self.LINE_OFF_BITS-self.ROW_OFF_BITS
1604 row = Signal(rlen)
1605 comb += row.eq(r1.wb.adr)
1606 sync += r1.wb.adr[:rlen].eq(row+1)
1607
1608 # Incoming acks processing
1609 sync += r1.forward_valid1.eq(bus.ack)
1610 with m.If(bus.ack):
1611 srow = Signal(self.ROW_LINE_BITS)
1612 comb += srow.eq(r1.store_row)
1613 sync += r1.rows_valid[srow].eq(1)
1614
1615 # If this is the data we were looking for,
1616 # we can complete the request next cycle.
1617 # Compare the whole address in case the
1618 # request in r1.req is not the one that
1619 # started this refill.
1620 with m.If(r1.full & r1.req.same_tag &
1621 ((r1.dcbz & req.dcbz) |
1622 (r1.req.op == Op.OP_LOAD_MISS)) &
1623 (r1.store_row ==
1624 self.get_row(r1.req.real_addr))):
1625 sync += r1.full.eq(r1_next_cycle)
1626 sync += r1.slow_valid.eq(1)
1627 with m.If(r1.mmu_req):
1628 sync += r1.mmu_done.eq(1)
1629 with m.Else():
1630 sync += r1.ls_valid.eq(1)
1631 sync += r1.forward_sel.eq(~0) # all 1s
1632 sync += r1.use_forward1.eq(1)
1633
1634 # Check for completion
1635 with m.If(ld_stbs_done & self.is_last_row(r1.store_row,
1636 r1.end_row_ix)):
1637 # Complete wishbone cycle
1638 sync += r1.wb.cyc.eq(0)
1639
1640 # Cache line is now valid
1641                         cv = Signal(self.NUM_WAYS) # one valid bit per way
1642 comb += cv.eq(cache_valids[r1.store_index])
1643 comb += cv.bit_select(r1.store_way, 1).eq(1)
1644 sync += cache_valids[r1.store_index].eq(cv)
1645
1646 sync += r1.state.eq(State.IDLE)
1647 sync += Display("cache valid set %x "
1648 "idx %d way %d",
1649 cv, r1.store_index, r1.store_way)
1650
1651 # Increment store row counter
1652 sync += r1.store_row.eq(self.next_row(r1.store_row))
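                    # Example of a full refill with the same default
                    # geometry: eight read requests go out (stb held until
                    # the last is accepted), eight acks come back, store_row
                    # advances on each ack, and is_last_row matches on the
                    # final ack, at which point cyc drops and the FSM
                    # returns to IDLE with the line's valid bit set.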
1653
1654 with m.Case(State.STORE_WAIT_ACK):
1655 st_stbs_done = Signal()
1656 adjust_acks = Signal(3)
1657
1658 comb += st_stbs_done.eq(~r1.wb.stb)
1659
1660 with m.If(r1.inc_acks != r1.dec_acks):
1661 with m.If(r1.inc_acks):
1662 comb += adjust_acks.eq(r1.acks_pending + 1)
1663 with m.Else():
1664 comb += adjust_acks.eq(r1.acks_pending - 1)
1665 with m.Else():
1666 comb += adjust_acks.eq(r1.acks_pending)
1667
1668 sync += r1.acks_pending.eq(adjust_acks)
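                # acks_pending (and adjust_acks) are 3 bits wide, so they
                # top out at 7: the (adjust_acks < 7) guard further down
                # stops a new stb being issued when one more outstanding
                # ack could not be counted.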
1669
1670 # Clear stb when slave accepted request
1671 with m.If(~bus.stall):
1672                     # See if there is another store waiting
1673                     # to be done that is in the same real page
1674                     # (this is when req.same_tag is true)
1675 with m.If(req.valid):
1676 _ra = req.real_addr[self.ROW_OFF_BITS:
1677 self.SET_SIZE_BITS]
1678 alen = self.SET_SIZE_BITS-self.ROW_OFF_BITS
1679 sync += r1.wb.adr[0:alen].eq(_ra)
1680 sync += r1.wb.dat.eq(req.data)
1681 sync += r1.wb.sel.eq(req.byte_sel)
1682
1683 with m.If((adjust_acks < 7) & req.same_tag &
1684 ((req.op == Op.OP_STORE_MISS) |
1685 (req.op == Op.OP_STORE_HIT))):
1686 sync += r1.wb.stb.eq(1)
1687 comb += st_stbs_done.eq(0)
1688 sync += r1.store_way.eq(req.hit_way)
1689 sync += r1.store_row.eq(self.get_row(req.real_addr))
1690
1691 with m.If(req.op == Op.OP_STORE_HIT):
1692 sync += r1.write_bram.eq(1)
1693 sync += r1.full.eq(r1_next_cycle)
1694 sync += r1.slow_valid.eq(1)
1695
1696 # Store requests never come from the MMU
1697 sync += r1.ls_valid.eq(1)
1698 comb += st_stbs_done.eq(0)
1699 sync += r1.inc_acks.eq(1)
1700 with m.Else():
1701 sync += r1.wb.stb.eq(0)
1702 comb += st_stbs_done.eq(1)
1703
1704                 # Got an ack? See if the cycle is complete.
1705                 sync += Display("got ack %d stbs %d adjust_acks %d",
1706                             bus.ack, st_stbs_done, adjust_acks)
1707 with m.If(bus.ack):
1708 with m.If(st_stbs_done & (adjust_acks == 1)):
1709 sync += r1.state.eq(State.IDLE)
1710 sync += r1.wb.cyc.eq(0)
1711 sync += r1.wb.stb.eq(0)
1712 sync += r1.dec_acks.eq(1)
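                # Example trace, assuming back-to-back stores to the same
                # page: stbs accepted on cycles 0,1,2 (inc_acks each time),
                # acks arriving on cycles 2,3,4 (dec_acks each time).
                # acks_pending peaks at 2, and the ack seen while
                # st_stbs_done is set and adjust_acks == 1 is the one that
                # returns the FSM to IDLE.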
1713
1714 with m.Case(State.NC_LOAD_WAIT_ACK):
1715 # Clear stb when slave accepted request
1716 with m.If(~bus.stall):
1717 sync += r1.wb.stb.eq(0)
1718
1719                 # Got an ack? Then the request is complete.
1720 with m.If(bus.ack):
1721 sync += r1.state.eq(State.IDLE)
1722 sync += r1.full.eq(r1_next_cycle)
1723 sync += r1.slow_valid.eq(1)
1724
1725 with m.If(r1.mmu_req):
1726 sync += r1.mmu_done.eq(1)
1727 with m.Else():
1728 sync += r1.ls_valid.eq(1)
1729
1730 sync += r1.forward_sel.eq(~0) # all 1s
1731 sync += r1.use_forward1.eq(1)
1732 sync += r1.wb.cyc.eq(0)
1733 sync += r1.wb.stb.eq(0)
1734
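    # A minimal pure-Python sketch of the acks_pending arithmetic used in
    # STORE_WAIT_ACK above, so the inc/dec protocol can be unit-tested in
    # isolation.  NOTE: hypothetical helper, not referenced anywhere else
    # in this file; "inc" mirrors r1.inc_acks (a store beat accepted) and
    # "dec" mirrors r1.dec_acks (an ack received).
    @staticmethod
    def model_acks_pending(acks_pending, inc, dec):
        # mirror the 3-bit adjust_acks mux: +1, -1 or hold, modulo 8
        if inc and not dec:
            return (acks_pending + 1) % 8
        if dec and not inc:
            return (acks_pending - 1) % 8
        return acks_pending
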
1735     def dcache_log(self, m, r1, valid_ra, tlb_hit, stall_out, req_op):
1736
1737 sync = m.d.sync
1738 d_out, bus, log_out = self.d_out, self.bus, self.log_out
1739
1740 sync += log_out.eq(Cat(r1.state[:3], valid_ra, tlb_hit.way[:3],
1741 stall_out, req_op[:3], d_out.valid, d_out.error,
1742 r1.wb.cyc, r1.wb.stb, bus.ack, bus.stall,
1743 r1.real_adr[3:6]))
1744
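    # Hedged companion to dcache_log above: a pure-Python unpacker for the
    # log_out bit packing, assuming the Cat() field order and widths used
    # there (Cat packs its first argument into the LSBs).  Hypothetical
    # helper, not used elsewhere in this file.
    @staticmethod
    def decode_log_word(word):
        fields = [("state", 3), ("valid_ra", 1), ("tlb_hit_way", 3),
                  ("stall_out", 1), ("req_op", 3), ("d_valid", 1),
                  ("d_error", 1), ("wb_cyc", 1), ("wb_stb", 1),
                  ("bus_ack", 1), ("bus_stall", 1), ("real_adr_3_6", 3)]
        out = {}
        for name, width in fields:
            out[name] = word & ((1 << width) - 1)
            word >>= width
        return out
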
1745 def elaborate(self, platform):
1746
1747 m = Module()
1748 comb, sync = m.d.comb, m.d.sync
1749 m_in, d_in = self.m_in, self.d_in
1750
1751 # Storage. Hopefully "cache_rows" is a BRAM, the rest is LUTs
1752 cache_valids = self.CacheValidsArray()
1753 cache_tag_set = Signal(self.TAG_RAM_WIDTH)
1754
1755 self.tagmem = Memory(depth=self.NUM_LINES, width=self.TAG_RAM_WIDTH)
1756
1757 """note: these are passed to nmigen.hdl.Memory as "attributes".
1758 don't know how, just that they are.
1759 """
1760 # TODO attribute ram_style of
1761 # dtlb_tags : signal is "distributed";
1762 # TODO attribute ram_style of
1763 # dtlb_ptes : signal is "distributed";
1764
1765 r0 = RegStage0("r0")
1766 r0_full = Signal()
1767
1768 r1 = RegStage1(self, "r1")
1769
1770 reservation = Reservation(self, "rsrv")
1771
1772 # Async signals on incoming request
1773 req_index = Signal(self.INDEX_BITS)
1774 req_row = Signal(self.ROW_BITS)
1775 req_hit_way = Signal(self.WAY_BITS)
1776 req_tag = Signal(self.TAG_BITS)
1777 req_op = Signal(Op)
1778 req_data = Signal(64)
1779 req_same_tag = Signal()
1780 req_go = Signal()
1781
1782 early_req_row = Signal(self.ROW_BITS)
1783
1784 cancel_store = Signal()
1785 set_rsrv = Signal()
1786 clear_rsrv = Signal()
1787
1788 r0_valid = Signal()
1789 r0_stall = Signal()
1790
1791 use_forward1_next = Signal()
1792 use_forward2_next = Signal()
1793
1794 cache_out_row = Signal(WB_DATA_BITS)
1795
1796 plru_victim = Signal(self.WAY_BITS)
1797 replace_way = Signal(self.WAY_BITS)
1798
1799 # Wishbone read/write/cache write formatting signals
1800 bus_sel = Signal(8)
1801
1802 # TLB signals
1803 tlb_way = self.TLBRecord("tlb_way")
1804 tlb_req_index = Signal(self.TLB_SET_BITS)
1805 tlb_hit = self.TLBHit("tlb_hit")
1806 pte = Signal(self.TLB_PTE_BITS)
1807 ra = Signal(self.REAL_ADDR_BITS)
1808 valid_ra = Signal()
1809 perm_attr = PermAttr("dc_perms")
1810 rc_ok = Signal()
1811 perm_ok = Signal()
1812 access_ok = Signal()
1813
1814 tlb_plru_victim = Signal(self.TLB_WAY_BITS)
1815
1816 # we don't yet handle collisions between loadstore1 requests
1817 # and MMU requests
1818 comb += self.m_out.stall.eq(0)
1819
1820 # Hold off the request in r0 when r1 has an uncompleted request
1821 comb += r0_stall.eq(r0_full & (r1.full | d_in.hold))
1822 comb += r0_valid.eq(r0_full & ~r1.full & ~d_in.hold)
1823 comb += self.stall_out.eq(r0_stall)
1824         # debugging: record whether a stall was ever requested (fine),
1825         # and whether a request arrived while stall was asserted (bad).
1826 with m.If(r0_stall):
1827 sync += self.any_stall_out.eq(1)
1828 with m.If(d_in.valid):
1829 sync += self.dreq_when_stall.eq(1)
1830 with m.If(m_in.valid):
1831 sync += self.mreq_when_stall.eq(1)
1832
1833         # deal with litex not doing wishbone pipeline mode
1834         # XXX this is done the wrong way: FIFOs are needed in the SRAM
1835         # test so that stb/ack match up. icache.py does the same thing.
1836 if not self.microwatt_compat:
1837 comb += self.bus.stall.eq(self.bus.cyc & ~self.bus.ack)
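            # with stall defined this way, stb can only be accepted in the
            # same cycle as its ack, so at most one request is ever in
            # flight and the pipelined-mode FSMs above degenerate to
            # classic single-beat wishbone behaviour.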
1838
1839 # Wire up wishbone request latch out of stage 1
1840 comb += self.bus.we.eq(r1.wb.we)
1841 comb += self.bus.adr.eq(r1.wb.adr)
1842 comb += self.bus.sel.eq(r1.wb.sel)
1843 comb += self.bus.stb.eq(r1.wb.stb)
1844 comb += self.bus.dat_w.eq(r1.wb.dat)
1845 comb += self.bus.cyc.eq(r1.wb.cyc)
1846
1847 # create submodule TLBUpdate
1848 m.submodules.dtlb_update = self.dtlb_update = DTLBUpdate(self)
1849
1850 # call sub-functions putting everything together, using shared
1851 # signals established above
1852 self.stage_0(m, r0, r1, r0_full)
1853 self.tlb_read(m, r0_stall, tlb_way)
1854 self.tlb_search(m, tlb_req_index, r0, r0_valid,
1855 tlb_way,
1856 pte, tlb_hit, valid_ra, perm_attr, ra)
1857 self.tlb_update(m, r0_valid, r0, tlb_req_index,
1858 tlb_hit, tlb_plru_victim)
1859 self.maybe_plrus(m, r1, plru_victim)
1860 self.maybe_tlb_plrus(m, r1, tlb_plru_victim, tlb_req_index)
1861 self.cache_tag_read(m, r0_stall, req_index, cache_tag_set)
1862 self.dcache_request(m, r0, ra, req_index, req_row, req_tag,
1863 r0_valid, r1, cache_valids, replace_way,
1864 use_forward1_next, use_forward2_next,
1865 req_hit_way, plru_victim, rc_ok, perm_attr,
1866 valid_ra, perm_ok, access_ok, req_op, req_go,
1867 tlb_hit, tlb_way, cache_tag_set,
1868 cancel_store, req_same_tag, r0_stall, early_req_row)
1869 self.reservation_comb(m, cancel_store, set_rsrv, clear_rsrv,
1870 r0_valid, r0, reservation)
1871 self.reservation_reg(m, r0_valid, access_ok, set_rsrv, clear_rsrv,
1872 reservation, r0)
1873 self.writeback_control(m, r1, cache_out_row)
1874 self.rams(m, r1, early_req_row, cache_out_row, replace_way)
1875 self.dcache_fast_hit(m, req_op, r0_valid, r0, r1,
1876 req_hit_way, req_index, req_tag, access_ok,
1877 tlb_hit, tlb_req_index)
1878 self.dcache_slow(m, r1, use_forward1_next, use_forward2_next,
1879 r0, replace_way,
1880 req_hit_way, req_same_tag,
1881 r0_valid, req_op, cache_valids, req_go, ra)
1882         #self.dcache_log(m, r1, valid_ra, tlb_hit, stall_out, req_op)
1883
1884 return m
1885
1886
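# A minimal simulation sketch for the module above.  Hedged assumptions:
# DCache() elaborates with its default DCacheConfig (as in the __main__
# block below) and the Simulator wrapper follows the standard nmigen.sim
# API; the function name and stimulus are illustrative only, not the
# project's test bench.
def sim_dcache_idle(vcd_name="test_dcache_idle.vcd"):
    dut = DCache()
    sim = Simulator(dut)
    sim.add_clock(1e-9)
    def process():
        # hold off all requests: the FSM should sit in State.IDLE
        yield dut.d_in.valid.eq(0)
        for _ in range(16):
            yield
    sim.add_sync_process(process)
    with sim.write_vcd(vcd_name):
        sim.run()
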
1887 if __name__ == '__main__':
1888 dut = DCache()
1889 vl = rtlil.convert(dut, ports=[])
1890 with open("test_dcache.il", "w") as f:
1891 f.write(vl)