src/soc/experiment/dcache.py

   1 #!/usr/bin/env python3
   2 #
   3 # Copyright (C) 2020,2021 Luke Kenneth Casson Leighton <lkcl@lkcl.net>
   4 # Copyright (C) 2020 Cole Poirier
   5 # Copyright (C) 2020,2021 Cesar Strauss
   6 # Copyright (C) 2021 Tobias Platen
   7 #
   8 # Original dcache.vhdl Copyright of its authors and licensed
   9 # by IBM under CC-BY 4.0
  10 # https://github.com/antonblanchard/microwatt
  11 #
  12 # Conversion to nmigen funded by NLnet and NGI POINTER under EU Grants
  13 # 871528 and 957073, under the LGPL-v3+ License
  14
  15 """DCache
  16
  17 based on Anton Blanchard microwatt dcache.vhdl
  18
  19 note that the microwatt dcache wishbone interface expects "stall".
  20 for simplicity at the moment this is hard-coded to cyc & ~ack.
  21 see WB4 spec, p84, section 5.2.1
  22
  23 IMPORTANT: for store, the data is sampled the cycle AFTER the "valid"
  24 is raised.  sigh
  25
  26 Links:
  27
  28 * https://libre-soc.org/3d_gpu/architecture/set_associative_cache.jpg
  29 * https://bugs.libre-soc.org/show_bug.cgi?id=469
  30 * https://libre-soc.org/irclog-microwatt/%23microwatt.2021-12-07.log.html
  31   (discussion about brams for ECP5)
  32
  33 """
  34
  35 import sys
  36
  37 from nmutil.gtkw import write_gtkw
  38
  39 sys.setrecursionlimit(1000000)
  40
  41 from enum import Enum, unique
  42
  43 from nmigen import (Module, Signal, Elaboratable, Cat, Repl, Array, Const,
  44                     Record, Memory)
  45 from nmutil.util import Display
  46 from nmigen.lib.coding import Decoder
  47
  48 from copy import deepcopy
  49 from random import randint, seed
  50
  51 from nmigen_soc.wishbone.bus import Interface
  52
  53 from nmigen.cli import main
  54 from nmutil.iocontrol import RecordObject
  55 from nmigen.utils import log2_int
  56 from soc.experiment.mem_types import (LoadStore1ToDCacheType,
  57                                      DCacheToLoadStore1Type,
  58                                      MMUToDCacheType,
  59                                      DCacheToMMUType)
  60
  61 from soc.experiment.wb_types import (WB_ADDR_BITS, WB_DATA_BITS, WB_SEL_BITS,
  62                                 WBAddrType, WBDataType, WBSelType,
  63                                 WBMasterOut, WBSlaveOut,
  64                                 WBMasterOutVector, WBSlaveOutVector,
  65                                 WBIOMasterOut, WBIOSlaveOut)
  66
  67 from soc.experiment.cache_ram import CacheRam
  68 from soc.experiment.plru import PLRU, PLRUs
  69 #from nmutil.plru import PLRU, PLRUs
  70
  71 # for test
  72 from soc.bus.sram import SRAM
  73 from nmigen import Memory
  74 from nmigen.cli import rtlil
  75
  76 # NOTE: to use cxxsim, export NMIGEN_SIM_MODE=cxxsim from the shell
  77 # Also, check out the cxxsim nmigen branch, and latest yosys from git
  78 from nmutil.sim_tmp_alternative import Simulator
  79
  80 from nmutil.util import wrap
  81
  82 LOG_LENGTH = 0    # Non-zero to enable log data collection
  83
  84 def ispow2(x):
  85     return (1<<log2_int(x, False)) == x
  86
  87
  88 class DCacheConfig:
  89     def __init__(self, LINE_SIZE = 64,    # Line size in bytes
  90                        NUM_LINES = 64,    # Number of lines in a set
  91                        NUM_WAYS = 2,      # Number of ways
  92                        TLB_SET_SIZE = 64, # L1 DTLB entries per set
  93                        TLB_NUM_WAYS = 2,  # L1 DTLB number of sets
  94                        TLB_LG_PGSZ = 12): # L1 DTLB log_2(page_size)
  95         self.LINE_SIZE = LINE_SIZE
  96         self.NUM_LINES = NUM_LINES
  97         self.NUM_WAYS = NUM_WAYS
  98         self.TLB_SET_SIZE = TLB_SET_SIZE
  99         self.TLB_NUM_WAYS = TLB_NUM_WAYS
 100         self.TLB_LG_PGSZ = TLB_LG_PGSZ
 101
 102         # BRAM organisation: We never access more than
 103         #     -- WB_DATA_BITS at a time so to save
 104         #     -- resources we make the array only that wide, and
 105         #     -- use consecutive indices to make a cache "line"
 106         #     --
 107         #     -- ROW_SIZE is the width in bytes of the BRAM
 108         #     -- (based on WB, so 64-bits)
 109         self.ROW_SIZE = WB_DATA_BITS // 8;
 110
 111         # ROW_PER_LINE is the number of row (wishbone
 112         # transactions) in a line
 113         self.ROW_PER_LINE = self.LINE_SIZE // self.ROW_SIZE
 114
 115         # BRAM_ROWS is the number of rows in BRAM needed
 116         # to represent the full dcache
 117         self.BRAM_ROWS = self.NUM_LINES * self.ROW_PER_LINE
 118
 119         print ("ROW_SIZE", self.ROW_SIZE)
 120         print ("ROW_PER_LINE", self.ROW_PER_LINE)
 121         print ("BRAM_ROWS", self.BRAM_ROWS)
 122         print ("NUM_WAYS", self.NUM_WAYS)
 123
 124         # Bit fields counts in the address
 125
 126         # REAL_ADDR_BITS is the number of real address
 127         # bits that we store
 128         self.REAL_ADDR_BITS = 56
 129
 130         # ROW_BITS is the number of bits to select a row
 131         self.ROW_BITS = log2_int(self.BRAM_ROWS)
 132
 133         # ROW_LINE_BITS is the number of bits to select
 134         # a row within a line
 135         self.ROW_LINE_BITS = log2_int(self.ROW_PER_LINE)
 136
 137         # LINE_OFF_BITS is the number of bits for
 138         # the offset in a cache line
 139         self.LINE_OFF_BITS = log2_int(self.LINE_SIZE)
 140
 141         # ROW_OFF_BITS is the number of bits for
 142         # the offset in a row
 143         self.ROW_OFF_BITS = log2_int(self.ROW_SIZE)
 144
 145         # INDEX_BITS is the number if bits to
 146         # select a cache line
 147         self.INDEX_BITS = log2_int(self.NUM_LINES)
 148
 149         # SET_SIZE_BITS is the log base 2 of the set size
 150         self.SET_SIZE_BITS = self.LINE_OFF_BITS + self.INDEX_BITS
 151
 152         # TAG_BITS is the number of bits of
 153         # the tag part of the address
 154         self.TAG_BITS = self.REAL_ADDR_BITS - self.SET_SIZE_BITS
 155
 156         # TAG_WIDTH is the width in bits of each way of the tag RAM
 157         self.TAG_WIDTH = self.TAG_BITS + 7 - ((self.TAG_BITS + 7) % 8)
 158
 159         # WAY_BITS is the number of bits to select a way
 160         self.WAY_BITS = log2_int(self.NUM_WAYS)
 161
 162         # Example of layout for 32 lines of 64 bytes:
 163         layout = f"""\
 164           DCache Layout:
 165          |.. -----------------------| REAL_ADDR_BITS ({self.REAL_ADDR_BITS})
 166           ..         |--------------| SET_SIZE_BITS ({self.SET_SIZE_BITS})
 167           ..  tag    |index|  line  |
 168           ..         |   row   |    |
 169           ..         |     |---|    | ROW_LINE_BITS ({self.ROW_LINE_BITS})
 170           ..         |     |--- - --| LINE_OFF_BITS ({self.LINE_OFF_BITS})
 171           ..         |         |- --| ROW_OFF_BITS  ({self.ROW_OFF_BITS})
 172           ..         |----- ---|    | ROW_BITS      ({self.ROW_BITS})
 173           ..         |-----|        | INDEX_BITS    ({self.INDEX_BITS})
 174           .. --------|              | TAG_BITS      ({self.TAG_BITS})
 175         """
 176         print (layout)
 177         print ("Dcache TAG %d IDX %d ROW_BITS %d ROFF %d LOFF %d RLB %d" % \
 178                     (self.TAG_BITS, self.INDEX_BITS, self.ROW_BITS,
 179                      self.ROW_OFF_BITS, self.LINE_OFF_BITS, self.ROW_LINE_BITS))
 180         print ("index @: %d-%d" % (self.LINE_OFF_BITS, self.SET_SIZE_BITS))
 181         print ("row @: %d-%d" % (self.LINE_OFF_BITS, self.ROW_OFF_BITS))
 182         print ("tag @: %d-%d width %d" % (self.SET_SIZE_BITS,
 183                                           self.REAL_ADDR_BITS, self.TAG_WIDTH))
 184
 185         self.TAG_RAM_WIDTH = self.TAG_WIDTH * self.NUM_WAYS
 186
 187         print ("TAG_RAM_WIDTH", self.TAG_RAM_WIDTH)
 188         print ("    TAG_WIDTH", self.TAG_WIDTH)
 189         print ("     NUM_WAYS", self.NUM_WAYS)
 190         print ("    NUM_LINES", self.NUM_LINES)
 191
 192         # L1 TLB
 193         self.TLB_SET_BITS     = log2_int(self.TLB_SET_SIZE)
 194         self.TLB_WAY_BITS     = log2_int(self.TLB_NUM_WAYS)
 195         self.TLB_EA_TAG_BITS  = 64 - (self.TLB_LG_PGSZ + self.TLB_SET_BITS)
 196         self.TLB_TAG_WAY_BITS = self.TLB_NUM_WAYS * self.TLB_EA_TAG_BITS
 197         self.TLB_PTE_BITS     = 64
 198         self.TLB_PTE_WAY_BITS = self.TLB_NUM_WAYS * self.TLB_PTE_BITS;
 199
 200         assert (self.LINE_SIZE % self.ROW_SIZE) == 0, \
 201                 "LINE_SIZE not multiple of ROW_SIZE"
 202         assert ispow2(self.LINE_SIZE), "LINE_SIZE not power of 2"
 203         assert ispow2(self.NUM_LINES), "NUM_LINES not power of 2"
 204         assert ispow2(self.ROW_PER_LINE), "ROW_PER_LINE not power of 2"
 205         assert self.ROW_BITS == \
 206                 (self.INDEX_BITS + self.ROW_LINE_BITS), \
 207                 "geometry bits don't add up"
 208         assert (self.LINE_OFF_BITS == \
 209                 self.ROW_OFF_BITS + self.ROW_LINE_BITS), \
 210                 "geometry bits don't add up"
 211         assert self.REAL_ADDR_BITS == \
 212                 (self.TAG_BITS + self.INDEX_BITS + self.LINE_OFF_BITS), \
 213                 "geometry bits don't add up"
 214         assert self.REAL_ADDR_BITS == \
 215                 (self.TAG_BITS + self.ROW_BITS + self.ROW_OFF_BITS), \
 216                  "geometry bits don't add up"
 217         assert 64 == WB_DATA_BITS, \
 218                 "Can't yet handle wb width that isn't 64-bits"
 219         assert self.SET_SIZE_BITS <= self.TLB_LG_PGSZ, \
 220                 "Set indexed by virtual address"
 221
 222     def CacheTagArray(self):
 223         return Array(Signal(self.TAG_RAM_WIDTH, name="tag%d" % x) \
 224                        for x in range(self.NUM_LINES))
 225
 226     def CacheValidsArray(self):
 227         return Array(Signal(self.NUM_WAYS, name="tag_valids%d" % x)
 228                      for x in range(self.NUM_LINES))
 229
 230     def RowPerLineValidArray(self):
 231         return Array(Signal(name="rows_valid%d" % x) \
 232                             for x in range(self.ROW_PER_LINE))
 233
 234     def TLBHit(self, name):
 235         return Record([('valid', 1),
 236                        ('way', self.TLB_WAY_BITS)], name=name)
 237
 238     def TLBTagEAArray(self):
 239         return Array(Signal(self.TLB_EA_TAG_BITS, name="tlbtagea%d" % x) \
 240                     for x in range (self.TLB_NUM_WAYS))
 241
 242     def TLBRecord(self, name):
 243         tlb_layout = [('valid', self.TLB_NUM_WAYS),
 244                       ('tag', self.TLB_TAG_WAY_BITS),
 245                       ('pte', self.TLB_PTE_WAY_BITS)
 246                      ]
 247         return Record(tlb_layout, name=name)
 248
 249     def TLBValidArray(self):
 250         return Array(Signal(self.TLB_NUM_WAYS, name="tlb_valid%d" % x)
 251                             for x in range(self.TLB_SET_SIZE))
 252
 253     def HitWaySet(self):
 254         return Array(Signal(self.WAY_BITS, name="hitway_%d" % x) \
 255                             for x in range(self.TLB_NUM_WAYS))
 256
 257     # Cache RAM interface
 258     def CacheRamOut(self):
 259         return Array(Signal(self.WB_DATA_BITS, name="cache_out%d" % x) \
 260                      for x in range(self.NUM_WAYS))
 261
 262     # PLRU output interface
 263     def PLRUOut(self):
 264         return Array(Signal(self.WAY_BITS, name="plru_out%d" % x) \
 265                     for x in range(self.NUM_LINES))
 266
 267     # TLB PLRU output interface
 268     def TLBPLRUOut(self):
 269         return Array(Signal(self.TLB_WAY_BITS, name="tlbplru_out%d" % x) \
 270                     for x in range(self.TLB_SET_SIZE))
 271
 272     # Helper functions to decode incoming requests
 273     #
 274     # Return the cache line index (tag index) for an address
 275     def get_index(self, addr):
 276         return addr[self.LINE_OFF_BITS:self.SET_SIZE_BITS]
 277
 278     # Return the cache row index (data memory) for an address
 279     def get_row(self, addr):
 280         return addr[self.ROW_OFF_BITS:self.SET_SIZE_BITS]
 281
 282     # Return the index of a row within a line
 283     def get_row_of_line(self, row):
 284         return row[:self.ROW_BITS][:self.ROW_LINE_BITS]
 285
 286     # Returns whether this is the last row of a line
 287     def is_last_row_addr(self, addr, last):
 288         return addr[self.ROW_OFF_BITS:self.LINE_OFF_BITS] == last
 289
 290     # Returns whether this is the last row of a line
 291     def is_last_row(self, row, last):
 292         return self.get_row_of_line(row) == last
 293
 294     # Return the next row in the current cache line. We use a
 295     # dedicated function in order to limit the size of the
 296     # generated adder to be only the bits within a cache line
 297     # (3 bits with default settings)
 298     def next_row(self, row):
 299         row_v = row[0:self.ROW_LINE_BITS] + 1
 300         return Cat(row_v[:self.ROW_LINE_BITS], row[self.ROW_LINE_BITS:])
 301
 302     # Get the tag value from the address
 303     def get_tag(self, addr):
 304         return addr[self.SET_SIZE_BITS:self.REAL_ADDR_BITS]
 305
 306     # Read a tag from a tag memory row
 307     def read_tag(self, way, tagset):
 308         return tagset.word_select(way, self.TAG_WIDTH)[:self.TAG_BITS]
 309
 310     # Read a TLB tag from a TLB tag memory row
 311     def read_tlb_tag(self, way, tags):
 312         return tags.word_select(way, self.TLB_EA_TAG_BITS)
 313
 314     # Write a TLB tag to a TLB tag memory row
 315     def write_tlb_tag(self, way, tags, tag):
 316         return self.read_tlb_tag(way, tags).eq(tag)
 317
 318     # Read a PTE from a TLB PTE memory row
 319     def read_tlb_pte(self, way, ptes):
 320         return ptes.word_select(way, self.TLB_PTE_BITS)
 321
 322     def write_tlb_pte(self, way, ptes, newpte):
 323         return self.read_tlb_pte(way, ptes).eq(newpte)
 324
 325
 326 # Record for storing permission, attribute, etc. bits from a PTE
 327 class PermAttr(RecordObject):
 328     def __init__(self, name=None):
 329         super().__init__(name=name)
 330         self.reference = Signal()
 331         self.changed   = Signal()
 332         self.nocache   = Signal()
 333         self.priv      = Signal()
 334         self.rd_perm   = Signal()
 335         self.wr_perm   = Signal()
 336
 337
 338 def extract_perm_attr(pte):
 339     pa = PermAttr()
 340     return pa;
 341
 342
 343 # Type of operation on a "valid" input
 344 @unique
 345 class Op(Enum):
 346     OP_NONE       = 0
 347     OP_BAD        = 1 # NC cache hit, TLB miss, prot/RC failure
 348     OP_STCX_FAIL  = 2 # conditional store w/o reservation
 349     OP_LOAD_HIT   = 3 # Cache hit on load
 350     OP_LOAD_MISS  = 4 # Load missing cache
 351     OP_LOAD_NC    = 5 # Non-cachable load
 352     OP_STORE_HIT  = 6 # Store hitting cache
 353     OP_STORE_MISS = 7 # Store missing cache
 354
 355
 356 # Cache state machine
 357 @unique
 358 class State(Enum):
 359     IDLE             = 0 # Normal load hit processing
 360     RELOAD_WAIT_ACK  = 1 # Cache reload wait ack
 361     STORE_WAIT_ACK   = 2 # Store wait ack
 362     NC_LOAD_WAIT_ACK = 3 # Non-cachable load wait ack
 363
 364
 365 # Dcache operations:
 366 #
 367 # In order to make timing, we use the BRAMs with
 368 # an output buffer, which means that the BRAM
 369 # output is delayed by an extra cycle.
 370 #
 371 # Thus, the dcache has a 2-stage internal pipeline
 372 # for cache hits with no stalls.
 373 #
 374 # All other operations are handled via stalling
 375 # in the first stage.
 376 #
 377 # The second stage can thus complete a hit at the same
 378 # time as the first stage emits a stall for a complex op.
 379 #
 380 # Stage 0 register, basically contains just the latched request
 381
 382 class RegStage0(RecordObject):
 383     def __init__(self, name=None):
 384         super().__init__(name=name)
 385         self.req     = LoadStore1ToDCacheType(name="lsmem")
 386         self.tlbie   = Signal() # indicates a tlbie request (from MMU)
 387         self.doall   = Signal() # with tlbie, indicates flush whole TLB
 388         self.tlbld   = Signal() # indicates a TLB load request (from MMU)
 389         self.mmu_req = Signal() # indicates source of request
 390         self.d_valid = Signal() # indicates req.data is valid now
 391
 392
 393 class MemAccessRequest(RecordObject):
 394     def __init__(self, cfg, name=None):
 395         super().__init__(name=name)
 396         self.op        = Signal(Op)
 397         self.valid     = Signal()
 398         self.dcbz      = Signal()
 399         self.real_addr = Signal(cfg.REAL_ADDR_BITS)
 400         self.data      = Signal(64)
 401         self.byte_sel  = Signal(8)
 402         self.hit_way   = Signal(cfg.WAY_BITS)
 403         self.same_tag  = Signal()
 404         self.mmu_req   = Signal()
 405
 406
 407 # First stage register, contains state for stage 1 of load hits
 408 # and for the state machine used by all other operations
 409 class RegStage1(RecordObject):
 410     def __init__(self, cfg, name=None):
 411         super().__init__(name=name)
 412         # Info about the request
 413         self.full             = Signal() # have uncompleted request
 414         self.mmu_req          = Signal() # request is from MMU
 415         self.req              = MemAccessRequest(cfg, name="reqmem")
 416
 417         # Cache hit state
 418         self.hit_way          = Signal(cfg.WAY_BITS)
 419         self.hit_load_valid   = Signal()
 420         self.hit_index        = Signal(cfg.INDEX_BITS)
 421         self.cache_hit        = Signal()
 422
 423         # TLB hit state
 424         self.tlb_hit          = cfg.TLBHit("tlb_hit")
 425         self.tlb_hit_index    = Signal(cfg.TLB_SET_BITS)
 426
 427         # 2-stage data buffer for data forwarded from writes to reads
 428         self.forward_data1    = Signal(64)
 429         self.forward_data2    = Signal(64)
 430         self.forward_sel1     = Signal(8)
 431         self.forward_valid1   = Signal()
 432         self.forward_way1     = Signal(cfg.WAY_BITS)
 433         self.forward_row1     = Signal(cfg.ROW_BITS)
 434         self.use_forward1     = Signal()
 435         self.forward_sel      = Signal(8)
 436
 437         # Cache miss state (reload state machine)
 438         self.state            = Signal(State)
 439         self.dcbz             = Signal()
 440         self.write_bram       = Signal()
 441         self.write_tag        = Signal()
 442         self.slow_valid       = Signal()
 443         self.wb               = WBMasterOut("wb")
 444         self.reload_tag       = Signal(cfg.TAG_BITS)
 445         self.store_way        = Signal(cfg.WAY_BITS)
 446         self.store_row        = Signal(cfg.ROW_BITS)
 447         self.store_index      = Signal(cfg.INDEX_BITS)
 448         self.end_row_ix       = Signal(cfg.ROW_LINE_BITS)
 449         self.rows_valid       = cfg.RowPerLineValidArray()
 450         self.acks_pending     = Signal(3)
 451         self.inc_acks         = Signal()
 452         self.dec_acks         = Signal()
 453
 454         # Signals to complete (possibly with error)
 455         self.ls_valid         = Signal()
 456         self.ls_error         = Signal()
 457         self.mmu_done         = Signal()
 458         self.mmu_error        = Signal()
 459         self.cache_paradox    = Signal()
 460
 461         # Signal to complete a failed stcx.
 462         self.stcx_fail        = Signal()
 463
 464
 465 # Reservation information
 466 class Reservation(RecordObject):
 467     def __init__(self, cfg, name=None):
 468         super().__init__(name=name)
 469         self.valid = Signal()
 470         self.addr  = Signal(64-cfg.LINE_OFF_BITS)
 471
 472
 473 class DTLBUpdate(Elaboratable):
 474     def __init__(self, cfg):
 475         self.cfg = cfg
 476         self.tlbie    = Signal()
 477         self.tlbwe    = Signal()
 478         self.doall    = Signal()
 479         self.tlb_hit     = cfg.TLBHit("tlb_hit")
 480         self.tlb_req_index = Signal(cfg.TLB_SET_BITS)
 481
 482         self.repl_way        = Signal(cfg.TLB_WAY_BITS)
 483         self.eatag           = Signal(cfg.TLB_EA_TAG_BITS)
 484         self.pte_data        = Signal(cfg.TLB_PTE_BITS)
 485
 486         # read from dtlb array
 487         self.tlb_read       = Signal()
 488         self.tlb_read_index = Signal(cfg.TLB_SET_BITS)
 489         self.tlb_way        = cfg.TLBRecord("o_tlb_way")
 490
 491     def elaborate(self, platform):
 492         m = Module()
 493         comb = m.d.comb
 494         sync = m.d.sync
 495         cfg = self.cfg
 496
 497         # there are 3 parts to this:
 498         # QTY TLB_NUM_WAYs TAGs - of width (say) 46 bits of Effective Address
 499         # QTY TLB_NUM_WAYs PTEs - of width (say) 64 bits
 500         # "Valid" bits, one per "way", of QTY TLB_NUM_WAYs.  these cannot
 501         # be a Memory because they can all be cleared (tlbie, doall), i mean,
 502         # we _could_, in theory, by overriding the Reset Signal of the Memory,
 503         # hmmm....
 504
 505         dtlb_valid = cfg.TLBValidArray()
 506         tlb_req_index = self.tlb_req_index
 507
 508         print ("TLB_TAG_WAY_BITS", cfg.TLB_TAG_WAY_BITS)
 509         print ("     TLB_EA_TAG_BITS", cfg.TLB_EA_TAG_BITS)
 510         print ("        TLB_NUM_WAYS", cfg.TLB_NUM_WAYS)
 511         print ("TLB_PTE_WAY_BITS", cfg.TLB_PTE_WAY_BITS)
 512         print ("    TLB_PTE_BITS", cfg.TLB_PTE_BITS)
 513         print ("    TLB_NUM_WAYS", cfg.TLB_NUM_WAYS)
 514
 515         # TAG and PTE Memory SRAMs. transparent, write-enables are TLB_NUM_WAYS
 516         tagway = Memory(depth=cfg.TLB_SET_SIZE, width=cfg.TLB_TAG_WAY_BITS)
 517         m.submodules.rd_tagway = rd_tagway = tagway.read_port()
 518         m.submodules.wr_tagway = wr_tagway = tagway.write_port(
 519                                     granularity=cfg.TLB_EA_TAG_BITS)
 520
 521         pteway = Memory(depth=cfg.TLB_SET_SIZE, width=cfg.TLB_PTE_WAY_BITS)
 522         m.submodules.rd_pteway = rd_pteway = pteway.read_port()
 523         m.submodules.wr_pteway = wr_pteway = pteway.write_port(
 524                                     granularity=cfg.TLB_PTE_BITS)
 525
 526         # commented out for now, can be put in if Memory.reset can be
 527         # used for tlbie&doall to reset the entire Memory to zero in 1 cycle
 528         #validm = Memory(depth=TLB_SET_SIZE, width=TLB_NUM_WAYS)
 529         #m.submodules.rd_valid = rd_valid = validm.read_port()
 530         #m.submodules.wr_valid = wr_valid = validm.write_port(
 531                                     #granularity=1)
 532
 533         # connect up read and write addresses to Valid/PTE/TAG SRAMs
 534         m.d.comb += rd_pteway.addr.eq(self.tlb_read_index)
 535         m.d.comb += rd_tagway.addr.eq(self.tlb_read_index)
 536         #m.d.comb += rd_valid.addr.eq(self.tlb_read_index)
 537         m.d.comb += wr_tagway.addr.eq(tlb_req_index)
 538         m.d.comb += wr_pteway.addr.eq(tlb_req_index)
 539         #m.d.comb += wr_valid.addr.eq(tlb_req_index)
 540
 541         updated  = Signal()
 542         v_updated  = Signal()
 543         tb_out = Signal(cfg.TLB_TAG_WAY_BITS) # tlb_way_tags_t
 544         db_out = Signal(cfg.TLB_NUM_WAYS)     # tlb_way_valids_t
 545         pb_out = Signal(cfg.TLB_PTE_WAY_BITS) # tlb_way_ptes_t
 546         dv = Signal(cfg.TLB_NUM_WAYS) # tlb_way_valids_t
 547
 548         comb += dv.eq(dtlb_valid[tlb_req_index])
 549         comb += db_out.eq(dv)
 550
 551         with m.If(self.tlbie & self.doall):
 552             # clear all valid bits at once
 553             # XXX hmmm, validm _could_ use Memory reset here...
 554             for i in range(cfg.TLB_SET_SIZE):
 555                 sync += dtlb_valid[i].eq(0)
 556         with m.Elif(self.tlbie):
 557             # invalidate just the hit_way
 558             with m.If(self.tlb_hit.valid):
 559                 comb += db_out.bit_select(self.tlb_hit.way, 1).eq(0)
 560                 comb += v_updated.eq(1)
 561         with m.Elif(self.tlbwe):
 562             # write to the requested tag and PTE
 563             comb += cfg.write_tlb_tag(self.repl_way, tb_out, self.eatag)
 564             comb += cfg.write_tlb_pte(self.repl_way, pb_out, self.pte_data)
 565             # set valid bit
 566             comb += db_out.bit_select(self.repl_way, 1).eq(1)
 567
 568             comb += updated.eq(1)
 569             comb += v_updated.eq(1)
 570
 571         # above, sometimes valid is requested to be updated but data not
 572         # therefore split them out, here.  note the granularity thing matches
 573         # with the shift-up of the eatag/pte_data into the correct TLB way.
 574         # thus is it not necessary to write the entire lot, just the portion
 575         # being altered: hence writing the *old* copy of the row is not needed
 576         with m.If(updated): # PTE and TAG to be written
 577             comb += wr_pteway.data.eq(pb_out)
 578             comb += wr_pteway.en.eq(1<<self.repl_way)
 579             comb += wr_tagway.data.eq(tb_out)
 580             comb += wr_tagway.en.eq(1<<self.repl_way)
 581         with m.If(v_updated): # Valid to be written
 582             sync += dtlb_valid[tlb_req_index].eq(db_out)
 583             #comb += wr_valid.data.eq(db_out)
 584             #comb += wr_valid.en.eq(1<<self.repl_way)
 585
 586         # select one TLB way, use a register here
 587         r_delay = Signal()
 588         sync += r_delay.eq(self.tlb_read)
 589         # first deal with the valids, which are not in a Memory.
 590         # tlb way valid is output on a 1 clock delay with sync,
 591         # but have to explicitly deal with "forwarding" here
 592         with m.If(self.tlb_read):
 593             with m.If(v_updated): # write *and* read in same cycle: forward
 594                 sync += self.tlb_way.valid.eq(db_out)
 595             with m.Else():
 596                 sync += self.tlb_way.valid.eq(dtlb_valid[self.tlb_read_index])
 597         # now deal with the Memory-read case. the output must remain
 598         # valid (stable) even when a read-request is not made, but stable
 599         # on a one-clock delay, hence the register
 600         r_tlb_way        = cfg.TLBRecord("r_tlb_way")
 601         with m.If(r_delay):
 602             # on one clock delay, capture the contents of the read port(s)
 603             comb += self.tlb_way.tag.eq(rd_tagway.data)
 604             comb += self.tlb_way.pte.eq(rd_pteway.data)
 605             sync += r_tlb_way.tag.eq(rd_tagway.data)
 606             sync += r_tlb_way.pte.eq(rd_pteway.data)
 607         with m.Else():
 608             # ... so that the register can output it when no read is requested
 609             # it's rather overkill but better to be safe than sorry
 610             comb += self.tlb_way.tag.eq(r_tlb_way.tag)
 611             comb += self.tlb_way.pte.eq(r_tlb_way.pte)
 612             #comb += self.tlb_way.eq(r_tlb_way)
 613
 614         return m
 615
 616
 617 class DCachePendingHit(Elaboratable):
 618
 619     def __init__(self, cfg, tlb_way,
 620                       cache_i_validdx, cache_tag_set,
 621                     req_addr):
 622
 623         self.go          = Signal()
 624         self.virt_mode   = Signal()
 625         self.is_hit      = Signal()
 626         self.tlb_hit     = cfg.TLBHit("tlb_hit")
 627         self.hit_way     = Signal(cfg.WAY_BITS)
 628         self.rel_match   = Signal()
 629         self.req_index   = Signal(cfg.INDEX_BITS)
 630         self.reload_tag  = Signal(cfg.TAG_BITS)
 631
 632         self.tlb_way = tlb_way
 633         self.cache_i_validdx = cache_i_validdx
 634         self.cache_tag_set = cache_tag_set
 635         self.req_addr = req_addr
 636         self.cfg = cfg
 637
 638     def elaborate(self, platform):
 639         m = Module()
 640         comb = m.d.comb
 641         sync = m.d.sync
 642
 643         go = self.go
 644         virt_mode = self.virt_mode
 645         is_hit = self.is_hit
 646         tlb_way = self.tlb_way
 647         cache_i_validdx = self.cache_i_validdx
 648         cache_tag_set = self.cache_tag_set
 649         req_addr = self.req_addr
 650         tlb_hit = self.tlb_hit
 651         hit_way = self.hit_way
 652         rel_match = self.rel_match
 653         req_index = self.req_index
 654         reload_tag = self.reload_tag
 655         cfg = self.cfg
 656
 657         hit_set     = Array(Signal(name="hit_set_%d" % i) \
 658                                   for i in range(cfg.TLB_NUM_WAYS))
 659         rel_matches = Array(Signal(name="rel_matches_%d" % i) \
 660                                     for i in range(cfg.TLB_NUM_WAYS))
 661         hit_way_set = cfg.HitWaySet()
 662
 663         # Test if pending request is a hit on any way
 664         # In order to make timing in virtual mode,
 665         # when we are using the TLB, we compare each
 666         # way with each of the real addresses from each way of
 667         # the TLB, and then decide later which match to use.
 668
 669         with m.If(virt_mode):
 670             for j in range(cfg.TLB_NUM_WAYS): # tlb_num_way_t
 671                 s_tag       = Signal(cfg.TAG_BITS, name="s_tag%d" % j)
 672                 s_hit       = Signal(name="s_hit%d" % j)
 673                 s_pte       = Signal(cfg.TLB_PTE_BITS, name="s_pte%d" % j)
 674                 s_ra        = Signal(cfg.REAL_ADDR_BITS, name="s_ra%d" % j)
 675                 # read the PTE, calc the Real Address, get tge tag
 676                 comb += s_pte.eq(cfg.read_tlb_pte(j, tlb_way.pte))
 677                 comb += s_ra.eq(Cat(req_addr[0:cfg.TLB_LG_PGSZ],
 678                                     s_pte[cfg.TLB_LG_PGSZ:cfg.REAL_ADDR_BITS]))
 679                 comb += s_tag.eq(cfg.get_tag(s_ra))
 680                 # for each way check tge tag against the cache tag set
 681                 for i in range(cfg.NUM_WAYS): # way_t
 682                     is_tag_hit = Signal(name="is_tag_hit_%d_%d" % (j, i))
 683                     comb += is_tag_hit.eq(go & cache_i_validdx[i] &
 684                                   (cfg.read_tag(i, cache_tag_set) == s_tag)
 685                                   & (tlb_way.valid[j]))
 686                     with m.If(is_tag_hit):
 687                         comb += hit_way_set[j].eq(i)
 688                         comb += s_hit.eq(1)
 689                 comb += hit_set[j].eq(s_hit)
 690                 comb += rel_matches[j].eq(s_tag == reload_tag)
 691             with m.If(tlb_hit.valid):
 692                 comb += is_hit.eq(hit_set[tlb_hit.way])
 693                 comb += hit_way.eq(hit_way_set[tlb_hit.way])
 694                 comb += rel_match.eq(rel_matches[tlb_hit.way])
 695         with m.Else():
 696             s_tag       = Signal(cfg.TAG_BITS)
 697             comb += s_tag.eq(cfg.get_tag(req_addr))
 698             for i in range(cfg.NUM_WAYS): # way_t
 699                 is_tag_hit = Signal(name="is_tag_hit_%d" % i)
 700                 comb += is_tag_hit.eq(go & cache_i_validdx[i] &
 701                           (cfg.read_tag(i, cache_tag_set) == s_tag))
 702                 with m.If(is_tag_hit):
 703                     comb += hit_way.eq(i)
 704                     comb += is_hit.eq(1)
 705             with m.If(s_tag == reload_tag):
 706                 comb += rel_match.eq(1)
 707
 708         return m
 709
 710
 711 class DCache(Elaboratable, DCacheConfig):
 712     """Set associative dcache write-through
 713
 714     TODO (in no specific order):
 715     * See list in icache.vhdl
 716     * Complete load misses on the cycle when WB data comes instead of
 717       at the end of line (this requires dealing with requests coming in
 718       while not idle...)
 719     """
 720     def __init__(self, pspec=None):
 721         self.d_in      = LoadStore1ToDCacheType("d_in")
 722         self.d_out     = DCacheToLoadStore1Type("d_out")
 723
 724         self.m_in      = MMUToDCacheType("m_in")
 725         self.m_out     = DCacheToMMUType("m_out")
 726
 727         self.stall_out = Signal()
 728         self.any_stall_out = Signal()
 729         self.dreq_when_stall = Signal()
 730         self.mreq_when_stall = Signal()
 731
 732         # standard naming (wired to non-standard for compatibility)
 733         self.bus = Interface(addr_width=32,
 734                             data_width=64,
 735                             granularity=8,
 736                             features={'stall'},
 737                             #alignment=0,
 738                             name="dcache")
 739
 740         self.log_out   = Signal(20)
 741
 742         # test if microwatt compatibility is to be enabled
 743         self.microwatt_compat = (hasattr(pspec, "microwatt_compat") and
 744                                  (pspec.microwatt_compat == True))
 745
 746         if self.microwatt_compat:
 747             # reduce way sizes and num lines
 748             super().__init__(NUM_LINES = 8,
 749                               NUM_WAYS = 1,
 750                               TLB_NUM_WAYS = 1,
 751                               TLB_SET_SIZE=16) # XXX needs device-tree entry
 752         else:
 753             super().__init__()
 754
 755     def stage_0(self, m, r0, r1, r0_full):
 756         """Latch the request in r0.req as long as we're not stalling
 757         """
 758         comb = m.d.comb
 759         sync = m.d.sync
 760         d_in, d_out, m_in = self.d_in, self.d_out, self.m_in
 761
 762         r = RegStage0("stage0")
 763
 764         # TODO, this goes in unit tests and formal proofs
 765         with m.If(d_in.valid & m_in.valid):
 766             sync += Display("request collision loadstore vs MMU")
 767
 768         with m.If(m_in.valid):
 769             comb += r.req.valid.eq(1)
 770             comb += r.req.load.eq(~(m_in.tlbie | m_in.tlbld))# no invalidate
 771             comb += r.req.dcbz.eq(0)
 772             comb += r.req.nc.eq(0)
 773             comb += r.req.reserve.eq(0)
 774             comb += r.req.virt_mode.eq(0)
 775             comb += r.req.priv_mode.eq(1)
 776             comb += r.req.addr.eq(m_in.addr)
 777             comb += r.req.data.eq(m_in.pte)
 778             comb += r.req.byte_sel.eq(~0) # Const -1 sets all to 0b111....
 779             comb += r.tlbie.eq(m_in.tlbie)
 780             comb += r.doall.eq(m_in.doall)
 781             comb += r.tlbld.eq(m_in.tlbld)
 782             comb += r.mmu_req.eq(1)
 783             comb += r.d_valid.eq(1)
 784             m.d.sync += Display("    DCACHE req mmu addr %x pte %x ld %d",
 785                                  m_in.addr, m_in.pte, r.req.load)
 786
 787         with m.Else():
 788             comb += r.req.eq(d_in)
 789             comb += r.req.data.eq(0)
 790             comb += r.tlbie.eq(0)
 791             comb += r.doall.eq(0)
 792             comb += r.tlbld.eq(0)
 793             comb += r.mmu_req.eq(0)
 794             comb += r.d_valid.eq(0)
 795
 796         sync += r0_full.eq(0)
 797         with m.If((~r1.full & ~d_in.hold) | ~r0_full):
 798             sync += r0.eq(r)
 799             sync += r0_full.eq(r.req.valid)
 800         with m.Elif(~r0.d_valid):
 801             # Sample data the cycle after a request comes in from loadstore1.
 802             # If another request has come in already then the data will get
 803             # put directly into req.data below.
 804             sync += r0.req.data.eq(d_in.data)
 805             sync += r0.d_valid.eq(1)
 806         with m.If(d_in.valid):
 807             m.d.sync += Display("    DCACHE req cache "
 808                                 "virt %d addr %x data %x ld %d",
 809                                  r.req.virt_mode, r.req.addr,
 810                                  r.req.data, r.req.load)
 811
 812     def tlb_read(self, m, r0_stall, tlb_way):
 813         """TLB
 814         Operates in the second cycle on the request latched in r0.req.
 815         TLB updates write the entry at the end of the second cycle.
 816         """
 817         comb = m.d.comb
 818         sync = m.d.sync
 819         m_in, d_in = self.m_in, self.d_in
 820
 821         addrbits = Signal(self.TLB_SET_BITS)
 822
 823         amin = self.TLB_LG_PGSZ
 824         amax = self.TLB_LG_PGSZ + self.TLB_SET_BITS
 825
 826         with m.If(m_in.valid):
 827             comb += addrbits.eq(m_in.addr[amin : amax])
 828         with m.Else():
 829             comb += addrbits.eq(d_in.addr[amin : amax])
 830
 831         # If we have any op and the previous op isn't finished,
 832         # then keep the same output for next cycle.
 833         d = self.dtlb_update
 834         comb += d.tlb_read_index.eq(addrbits)
 835         comb += d.tlb_read.eq(~r0_stall)
 836         comb += tlb_way.eq(d.tlb_way)
 837
 838     def maybe_tlb_plrus(self, m, r1, tlb_plru_victim, tlb_req_index):
 839         """Generate TLB PLRUs
 840         """
 841         comb = m.d.comb
 842         sync = m.d.sync
 843
 844         if self.TLB_NUM_WAYS == 0:
 845             return
 846
 847         # suite of PLRUs with a selection and output mechanism
 848         tlb_plrus = PLRUs(self.TLB_SET_SIZE, self.TLB_WAY_BITS)
 849         m.submodules.tlb_plrus = tlb_plrus
 850         comb += tlb_plrus.way.eq(r1.tlb_hit.way)
 851         comb += tlb_plrus.valid.eq(r1.tlb_hit.valid)
 852         comb += tlb_plrus.index.eq(r1.tlb_hit_index)
 853         comb += tlb_plrus.isel.eq(tlb_req_index) # select victim
 854         comb += tlb_plru_victim.eq(tlb_plrus.o_index) # selected victim
 855
 856     def tlb_search(self, m, tlb_req_index, r0, r0_valid,
 857                    tlb_way,
 858                    pte, tlb_hit, valid_ra, perm_attr, ra):
 859
 860         comb = m.d.comb
 861
 862         hitway = Signal(self.TLB_WAY_BITS)
 863         hit    = Signal()
 864         eatag  = Signal(self.TLB_EA_TAG_BITS)
 865
 866         self.TLB_LG_END = self.TLB_LG_PGSZ + self.TLB_SET_BITS
 867         r0_req_addr = r0.req.addr[self.TLB_LG_PGSZ : self.TLB_LG_END]
 868         comb += tlb_req_index.eq(r0_req_addr)
 869         comb += eatag.eq(r0.req.addr[self.TLB_LG_END : 64 ])
 870
 871         for i in range(self.TLB_NUM_WAYS):
 872             is_tag_hit = Signal(name="is_tag_hit%d" % i)
 873             tlb_tag = Signal(self.TLB_EA_TAG_BITS, name="tlb_tag%d" % i)
 874             comb += tlb_tag.eq(self.read_tlb_tag(i, tlb_way.tag))
 875             comb += is_tag_hit.eq((tlb_way.valid[i]) & (tlb_tag == eatag))
 876             with m.If(is_tag_hit):
 877                 comb += hitway.eq(i)
 878                 comb += hit.eq(1)
 879
 880         comb += tlb_hit.valid.eq(hit & r0_valid)
 881         comb += tlb_hit.way.eq(hitway)
 882
 883         with m.If(tlb_hit.valid):
 884             comb += pte.eq(self.read_tlb_pte(hitway, tlb_way.pte))
 885         comb += valid_ra.eq(tlb_hit.valid | ~r0.req.virt_mode)
 886
 887         with m.If(r0.req.virt_mode):
 888             comb += ra.eq(Cat(Const(0, self.ROW_OFF_BITS),
 889                               r0.req.addr[self.ROW_OFF_BITS:self.TLB_LG_PGSZ],
 890                               pte[self.TLB_LG_PGSZ:self.REAL_ADDR_BITS]))
 891             comb += perm_attr.reference.eq(pte[8])
 892             comb += perm_attr.changed.eq(pte[7])
 893             comb += perm_attr.nocache.eq(pte[5])
 894             comb += perm_attr.priv.eq(pte[3])
 895             comb += perm_attr.rd_perm.eq(pte[2])
 896             comb += perm_attr.wr_perm.eq(pte[1])
 897         with m.Else():
 898             comb += ra.eq(Cat(Const(0, self.ROW_OFF_BITS),
 899                           r0.req.addr[self.ROW_OFF_BITS:self.REAL_ADDR_BITS]))
 900             comb += perm_attr.reference.eq(1)
 901             comb += perm_attr.changed.eq(1)
 902             comb += perm_attr.nocache.eq(0)
 903             comb += perm_attr.priv.eq(1)
 904             comb += perm_attr.rd_perm.eq(1)
 905             comb += perm_attr.wr_perm.eq(1)
 906
 907         with m.If(valid_ra):
 908             m.d.sync += Display("DCACHE virt mode %d hit %d ra %x pte %x",
 909                                 r0.req.virt_mode, tlb_hit.valid, ra, pte)
 910             m.d.sync += Display("       perm ref=%d", perm_attr.reference)
 911             m.d.sync += Display("       perm chg=%d", perm_attr.changed)
 912             m.d.sync += Display("       perm noc=%d", perm_attr.nocache)
 913             m.d.sync += Display("       perm prv=%d", perm_attr.priv)
 914             m.d.sync += Display("       perm rdp=%d", perm_attr.rd_perm)
 915             m.d.sync += Display("       perm wrp=%d", perm_attr.wr_perm)
 916
 917     def tlb_update(self, m, r0_valid, r0, tlb_req_index,
 918                     tlb_hit, tlb_plru_victim):
 919
 920         comb = m.d.comb
 921         sync = m.d.sync
 922
 923         tlbie    = Signal()
 924         tlbwe    = Signal()
 925
 926         comb += tlbie.eq(r0_valid & r0.tlbie)
 927         comb += tlbwe.eq(r0_valid & r0.tlbld)
 928
 929         d = self.dtlb_update
 930
 931         comb += d.tlbie.eq(tlbie)
 932         comb += d.tlbwe.eq(tlbwe)
 933         comb += d.doall.eq(r0.doall)
 934         comb += d.tlb_hit.eq(tlb_hit)
 935         comb += d.tlb_req_index.eq(tlb_req_index)
 936
 937         with m.If(tlb_hit.valid):
 938             comb += d.repl_way.eq(tlb_hit.way)
 939         with m.Else():
 940             comb += d.repl_way.eq(tlb_plru_victim)
 941         comb += d.eatag.eq(r0.req.addr[self.TLB_LG_PGSZ + self.TLB_SET_BITS:64])
 942         comb += d.pte_data.eq(r0.req.data)
 943
 944     def maybe_plrus(self, m, r1, plru_victim):
 945         """Generate PLRUs
 946         """
 947         comb = m.d.comb
 948         sync = m.d.sync
 949
 950         if self.TLB_NUM_WAYS == 0:
 951             return
 952
 953         # suite of PLRUs with a selection and output mechanism
 954         m.submodules.plrus = plrus = PLRUs(self.NUM_LINES, self.WAY_BITS)
 955         comb += plrus.way.eq(r1.hit_way)
 956         comb += plrus.valid.eq(r1.cache_hit)
 957         comb += plrus.index.eq(r1.hit_index)
 958         comb += plrus.isel.eq(r1.store_index) # select victim
 959         comb += plru_victim.eq(plrus.o_index) # selected victim
 960
 961     def cache_tag_read(self, m, r0_stall, req_index, cache_tag_set):
 962         """Cache tag RAM read port
 963         """
 964         comb = m.d.comb
 965         sync = m.d.sync
 966
 967         m_in, d_in = self.m_in, self.d_in
 968
 969         # synchronous tag read-port
 970         m.submodules.rd_tag = rd_tag = self.tagmem.read_port()
 971
 972         index = Signal(self.INDEX_BITS)
 973
 974         with m.If(r0_stall):
 975             comb += index.eq(req_index)
 976         with m.Elif(m_in.valid):
 977             comb += index.eq(self.get_index(m_in.addr))
 978         with m.Else():
 979             comb += index.eq(self.get_index(d_in.addr))
 980         comb += rd_tag.addr.eq(index)
 981         comb += cache_tag_set.eq(rd_tag.data) # read-port is a 1-clock delay
 982
 983     def dcache_request(self, m, r0, ra, req_index, req_row, req_tag,
 984                        r0_valid, r1, cache_valids, replace_way,
 985                        use_forward1_next, use_forward2_next,
 986                        req_hit_way, plru_victim, rc_ok, perm_attr,
 987                        valid_ra, perm_ok, access_ok, req_op, req_go,
 988                        tlb_hit, tlb_way, cache_tag_set,
 989                        cancel_store, req_same_tag, r0_stall, early_req_row):
 990         """Cache request parsing and hit detection
 991         """
 992
 993         comb = m.d.comb
 994         m_in, d_in = self.m_in, self.d_in
 995
 996         is_hit      = Signal()
 997         hit_way     = Signal(self.WAY_BITS)
 998         op          = Signal(Op)
 999         opsel       = Signal(3)
1000         go          = Signal()
1001         nc          = Signal()
1002         cache_i_validdx = Signal(self.NUM_WAYS)
1003
1004         # Extract line, row and tag from request
1005         comb += req_index.eq(self.get_index(r0.req.addr))
1006         comb += req_row.eq(self.get_row(r0.req.addr))
1007         comb += req_tag.eq(self.get_tag(ra))
1008
1009         if False: # display on comb is a bit... busy.
1010             comb += Display("dcache_req addr:%x ra: %x idx: %x tag: %x row: %x",
1011                     r0.req.addr, ra, req_index, req_tag, req_row)
1012
1013         comb += go.eq(r0_valid & ~(r0.tlbie | r0.tlbld) & ~r1.ls_error)
1014         comb += cache_i_validdx.eq(cache_valids[req_index])
1015
1016         m.submodules.dcache_pend = dc = DCachePendingHit(self, tlb_way,
1017                                             cache_i_validdx, cache_tag_set,
1018                                             r0.req.addr)
1019         comb += dc.tlb_hit.eq(tlb_hit)
1020         comb += dc.reload_tag.eq(r1.reload_tag)
1021         comb += dc.virt_mode.eq(r0.req.virt_mode)
1022         comb += dc.go.eq(go)
1023         comb += dc.req_index.eq(req_index)
1024
1025         comb += is_hit.eq(dc.is_hit)
1026         comb += hit_way.eq(dc.hit_way)
1027         comb += req_same_tag.eq(dc.rel_match)
1028
1029         # See if the request matches the line currently being reloaded
1030         with m.If((r1.state == State.RELOAD_WAIT_ACK) &
1031                   (req_index == r1.store_index) & req_same_tag):
1032             # For a store, consider this a hit even if the row isn't
1033             # valid since it will be by the time we perform the store.
1034             # For a load, check the appropriate row valid bit.
1035             rrow = Signal(self.ROW_LINE_BITS)
1036             comb += rrow.eq(req_row)
1037             valid = r1.rows_valid[rrow]
1038             comb += is_hit.eq((~r0.req.load) | valid)
1039             comb += hit_way.eq(replace_way)
1040
1041         # Whether to use forwarded data for a load or not
1042         with m.If((self.get_row(r1.req.real_addr) == req_row) &
1043                   (r1.req.hit_way == hit_way)):
1044             # Only need to consider r1.write_bram here, since if we
1045             # are writing refill data here, then we don't have a
1046             # cache hit this cycle on the line being refilled.
1047             # (There is the possibility that the load following the
1048             # load miss that started the refill could be to the old
1049             # contents of the victim line, since it is a couple of
1050             # cycles after the refill starts before we see the updated
1051             # cache tag. In that case we don't use the bypass.)
1052             comb += use_forward1_next.eq(r1.write_bram)
1053         with m.If((r1.forward_row1 == req_row) & (r1.forward_way1 == hit_way)):
1054             comb += use_forward2_next.eq(r1.forward_valid1)
1055
1056         # The way that matched on a hit
1057         comb += req_hit_way.eq(hit_way)
1058
1059         # The way to replace on a miss
1060         with m.If(r1.write_tag):
1061             comb += replace_way.eq(plru_victim)
1062         with m.Else():
1063             comb += replace_way.eq(r1.store_way)
1064
1065         # work out whether we have permission for this access
1066         # NB we don't yet implement AMR, thus no KUAP
1067         comb += rc_ok.eq(perm_attr.reference
1068                          & (r0.req.load | perm_attr.changed))
1069         comb += perm_ok.eq((r0.req.priv_mode | (~perm_attr.priv)) &
1070                            (perm_attr.wr_perm |
1071                               (r0.req.load & perm_attr.rd_perm)))
1072         comb += access_ok.eq(valid_ra & perm_ok & rc_ok)
1073
1074         # Combine the request and cache hit status to decide what
1075         # operation needs to be done
1076         comb += nc.eq(r0.req.nc | perm_attr.nocache)
1077         comb += op.eq(Op.OP_NONE)
1078         with m.If(go):
1079             with m.If(~access_ok):
1080                 m.d.sync += Display("DCACHE access fail valid_ra=%d p=%d rc=%d",
1081                                  valid_ra, perm_ok, rc_ok)
1082                 comb += op.eq(Op.OP_BAD)
1083             with m.Elif(cancel_store):
1084                 m.d.sync += Display("DCACHE cancel store")
1085                 comb += op.eq(Op.OP_STCX_FAIL)
1086             with m.Else():
1087                 m.d.sync += Display("DCACHE valid_ra=%d nc=%d ld=%d",
1088                                  valid_ra, nc, r0.req.load)
1089                 comb += opsel.eq(Cat(is_hit, nc, r0.req.load))
1090                 with m.Switch(opsel):
1091                     with m.Case(0b101): comb += op.eq(Op.OP_LOAD_HIT)
1092                     with m.Case(0b100): comb += op.eq(Op.OP_LOAD_MISS)
1093                     with m.Case(0b110): comb += op.eq(Op.OP_LOAD_NC)
1094                     with m.Case(0b001): comb += op.eq(Op.OP_STORE_HIT)
1095                     with m.Case(0b000): comb += op.eq(Op.OP_STORE_MISS)
1096                     with m.Case(0b010): comb += op.eq(Op.OP_STORE_MISS)
1097                     with m.Case(0b011): comb += op.eq(Op.OP_BAD)
1098                     with m.Case(0b111): comb += op.eq(Op.OP_BAD)
1099         comb += req_op.eq(op)
1100         comb += req_go.eq(go)
1101
1102         # Version of the row number that is valid one cycle earlier
1103         # in the cases where we need to read the cache data BRAM.
1104         # If we're stalling then we need to keep reading the last
1105         # row requested.
1106         with m.If(~r0_stall):
1107             with m.If(m_in.valid):
1108                 comb += early_req_row.eq(self.get_row(m_in.addr))
1109             with m.Else():
1110                 comb += early_req_row.eq(self.get_row(d_in.addr))
1111         with m.Else():
1112             comb += early_req_row.eq(req_row)
1113
1114     def reservation_comb(self, m, cancel_store, set_rsrv, clear_rsrv,
1115                          r0_valid, r0, reservation):
1116         """Handle load-with-reservation and store-conditional instructions
1117         """
1118         comb = m.d.comb
1119
1120         with m.If(r0_valid & r0.req.reserve):
1121             # XXX generate alignment interrupt if address
1122             # is not aligned XXX or if r0.req.nc = '1'
1123             with m.If(r0.req.load):
1124                 comb += set_rsrv.eq(r0.req.atomic_last) # load with reservation
1125             with m.Else():
1126                 comb += clear_rsrv.eq(r0.req.atomic_last) # store conditional
1127                 with m.If((~reservation.valid) |
1128                          (r0.req.addr[self.LINE_OFF_BITS:64] !=
1129                           reservation.addr)):
1130                     comb += cancel_store.eq(1)
1131
1132     def reservation_reg(self, m, r0_valid, access_ok, set_rsrv, clear_rsrv,
1133                         reservation, r0):
1134         comb = m.d.comb
1135         sync = m.d.sync
1136
1137         with m.If(r0_valid & access_ok):
1138             with m.If(clear_rsrv):
1139                 sync += reservation.valid.eq(0)
1140             with m.Elif(set_rsrv):
1141                 sync += reservation.valid.eq(1)
1142                 sync += reservation.addr.eq(r0.req.addr[self.LINE_OFF_BITS:64])
1143
1144     def writeback_control(self, m, r1, cache_out_row):
1145         """Return data for loads & completion control logic
1146         """
1147         comb = m.d.comb
1148         sync = m.d.sync
1149         d_out, m_out = self.d_out, self.m_out
1150
1151         data_out = Signal(64)
1152         data_fwd = Signal(64)
1153
1154         # Use the bypass if are reading the row that was
1155         # written 1 or 2 cycles ago, including for the
1156         # slow_valid = 1 case (i.e. completing a load
1157         # miss or a non-cacheable load).
1158         with m.If(r1.use_forward1):
1159             comb += data_fwd.eq(r1.forward_data1)
1160         with m.Else():
1161             comb += data_fwd.eq(r1.forward_data2)
1162
1163         comb += data_out.eq(cache_out_row)
1164
1165         for i in range(8):
1166             with m.If(r1.forward_sel[i]):
1167                 dsel = data_fwd.word_select(i, 8)
1168                 comb += data_out.word_select(i, 8).eq(dsel)
1169
1170         # DCache output to LoadStore
1171         comb += d_out.valid.eq(r1.ls_valid)
1172         comb += d_out.data.eq(data_out)
1173         comb += d_out.store_done.eq(~r1.stcx_fail)
1174         comb += d_out.error.eq(r1.ls_error)
1175         comb += d_out.cache_paradox.eq(r1.cache_paradox)
1176
1177         # Outputs to MMU
1178         comb += m_out.done.eq(r1.mmu_done)
1179         comb += m_out.err.eq(r1.mmu_error)
1180         comb += m_out.data.eq(data_out)
1181
1182         # We have a valid load or store hit or we just completed
1183         # a slow op such as a load miss, a NC load or a store
1184         #
1185         # Note: the load hit is delayed by one cycle. However it
1186         # can still not collide with r.slow_valid (well unless I
1187         # miscalculated) because slow_valid can only be set on a
1188         # subsequent request and not on its first cycle (the state
1189         # machine must have advanced), which makes slow_valid
1190         # at least 2 cycles from the previous hit_load_valid.
1191
1192         # Sanity: Only one of these must be set in any given cycle
1193
1194         if False: # TODO: need Display to get this to work
1195             assert (r1.slow_valid & r1.stcx_fail) != 1, \
1196             "unexpected slow_valid collision with stcx_fail"
1197
1198             assert ((r1.slow_valid | r1.stcx_fail) | r1.hit_load_valid) != 1, \
1199              "unexpected hit_load_delayed collision with slow_valid"
1200
1201         with m.If(~r1.mmu_req):
1202             # Request came from loadstore1...
1203             # Load hit case is the standard path
1204             with m.If(r1.hit_load_valid):
1205                 sync += Display("completing load hit data=%x", data_out)
1206
1207             # error cases complete without stalling
1208             with m.If(r1.ls_error):
1209                 with m.If(r1.dcbz):
1210                     sync += Display("completing dcbz with error")
1211                 with m.Else():
1212                     sync += Display("completing ld/st with error")
1213
1214             # Slow ops (load miss, NC, stores)
1215             with m.If(r1.slow_valid):
1216                 sync += Display("completing store or load miss adr=%x data=%x",
1217                                 r1.req.real_addr, data_out)
1218
1219         with m.Else():
1220             # Request came from MMU
1221             with m.If(r1.hit_load_valid):
1222                 sync += Display("completing load hit to MMU, data=%x",
1223                                 m_out.data)
1224             # error cases complete without stalling
1225             with m.If(r1.mmu_error):
1226                 sync += Display("combpleting MMU ld with error")
1227
1228             # Slow ops (i.e. load miss)
1229             with m.If(r1.slow_valid):
1230                 sync += Display("completing MMU load miss, adr=%x data=%x",
1231                                 r1.req.real_addr, m_out.data)
1232
1233     def rams(self, m, r1, early_req_row, cache_out_row, replace_way):
1234         """rams
1235         Generate a cache RAM for each way. This handles the normal
1236         reads, writes from reloads and the special store-hit update
1237         path as well.
1238
1239         Note: the BRAMs have an extra read buffer, meaning the output
1240         is pipelined an extra cycle. This differs from the
1241         icache. The writeback logic needs to take that into
1242         account by using 1-cycle delayed signals for load hits.
1243         """
1244         comb = m.d.comb
1245         bus = self.bus
1246
1247         # a Binary-to-Unary one-hots here.  replace-way one-hot is gated
1248         # (enabled) by bus.ack, not-write-bram, and state RELOAD_WAIT_ACK
1249         m.submodules.rams_replace_way_e = rwe = Decoder(self.NUM_WAYS)
1250         comb += rwe.n.eq(~((r1.state == State.RELOAD_WAIT_ACK) & bus.ack &
1251                    ~r1.write_bram))
1252         comb += rwe.i.eq(replace_way)
1253
1254         m.submodules.rams_hit_way_e = hwe = Decoder(self.NUM_WAYS)
1255         comb += hwe.i.eq(r1.hit_way)
1256
1257         # this one is gated with write_bram, and replace_way_e can never be
1258         # set at the same time.  that means that do_write can OR the outputs
1259         m.submodules.rams_hit_req_way_e = hre = Decoder(self.NUM_WAYS)
1260         comb += hre.n.eq(~r1.write_bram) # Decoder.n is inverted
1261         comb += hre.i.eq(r1.req.hit_way)
1262
1263         # common Signals
1264         do_read  = Signal()
1265         wr_addr  = Signal(self.ROW_BITS)
1266         wr_data  = Signal(WB_DATA_BITS)
1267         wr_sel   = Signal(self.ROW_SIZE)
1268         rd_addr  = Signal(self.ROW_BITS)
1269
1270         comb += do_read.eq(1) # always enable
1271         comb += rd_addr.eq(early_req_row)
1272
1273         # Write mux:
1274         #
1275         # Defaults to wishbone read responses (cache refill)
1276         #
1277         # For timing, the mux on wr_data/sel/addr is not
1278         # dependent on anything other than the current state.
1279
1280         with m.If(r1.write_bram):
1281             # Write store data to BRAM.  This happens one
1282             # cycle after the store is in r0.
1283             comb += wr_data.eq(r1.req.data)
1284             comb += wr_sel.eq(r1.req.byte_sel)
1285             comb += wr_addr.eq(self.get_row(r1.req.real_addr))
1286
1287         with m.Else():
1288             # Otherwise, we might be doing a reload or a DCBZ
1289             with m.If(r1.dcbz):
1290                 comb += wr_data.eq(0)
1291             with m.Else():
1292                 comb += wr_data.eq(bus.dat_r)
1293             comb += wr_addr.eq(r1.store_row)
1294             comb += wr_sel.eq(~0) # all 1s
1295
1296         # set up Cache Rams
1297         for i in range(self.NUM_WAYS):
1298             do_write = Signal(name="do_wr%d" % i)
1299             wr_sel_m = Signal(self.ROW_SIZE, name="wr_sel_m_%d" % i)
1300             d_out= Signal(WB_DATA_BITS, name="dout_%d" % i) # cache_row_t
1301
1302             way = CacheRam(self.ROW_BITS, WB_DATA_BITS, ADD_BUF=True, ram_num=i)
1303             m.submodules["cacheram_%d" % i] = way
1304
1305             comb += way.rd_en.eq(do_read)
1306             comb += way.rd_addr.eq(rd_addr)
1307             comb += d_out.eq(way.rd_data_o)
1308             comb += way.wr_sel.eq(wr_sel_m)
1309             comb += way.wr_addr.eq(wr_addr)
1310             comb += way.wr_data.eq(wr_data)
1311
1312             # Cache hit reads
1313             with m.If(hwe.o[i]):
1314                 comb += cache_out_row.eq(d_out)
1315
1316             # these are mutually-exclusive via their Decoder-enablers
1317             # (note: Decoder-enable is inverted)
1318             comb += do_write.eq(hre.o[i] | rwe.o[i])
1319
1320             # Mask write selects with do_write since BRAM
1321             # doesn't have a global write-enable
1322             with m.If(do_write):
1323                 comb += wr_sel_m.eq(wr_sel)
1324
1325     # Cache hit synchronous machine for the easy case.
1326     # This handles load hits.
1327     # It also handles error cases (TLB miss, cache paradox)
1328     def dcache_fast_hit(self, m, req_op, r0_valid, r0, r1,
1329                         req_hit_way, req_index, req_tag, access_ok,
1330                         tlb_hit, tlb_req_index):
1331         comb = m.d.comb
1332         sync = m.d.sync
1333
1334         with m.If(req_op != Op.OP_NONE):
1335             sync += Display("op:%d addr:%x nc: %d idx: %x tag: %x way: %x",
1336                     req_op, r0.req.addr, r0.req.nc,
1337                     req_index, req_tag, req_hit_way)
1338
1339         with m.If(r0_valid):
1340             sync += r1.mmu_req.eq(r0.mmu_req)
1341
1342         # Fast path for load/store hits.
1343         # Set signals for the writeback controls.
1344         sync += r1.hit_way.eq(req_hit_way)
1345         sync += r1.hit_index.eq(req_index)
1346
1347         sync += r1.hit_load_valid.eq(req_op == Op.OP_LOAD_HIT)
1348         sync += r1.cache_hit.eq((req_op == Op.OP_LOAD_HIT) |
1349                                 (req_op == Op.OP_STORE_HIT))
1350
1351         with m.If(req_op == Op.OP_BAD):
1352             sync += Display("Signalling ld/st error "
1353                             "ls_error=%i mmu_error=%i cache_paradox=%i",
1354                             ~r0.mmu_req,r0.mmu_req,access_ok)
1355             sync += r1.ls_error.eq(~r0.mmu_req)
1356             sync += r1.mmu_error.eq(r0.mmu_req)
1357             sync += r1.cache_paradox.eq(access_ok)
1358         with m.Else():
1359             sync += r1.ls_error.eq(0)
1360             sync += r1.mmu_error.eq(0)
1361             sync += r1.cache_paradox.eq(0)
1362
1363         sync += r1.stcx_fail.eq(req_op == Op.OP_STCX_FAIL)
1364
1365         # Record TLB hit information for updating TLB PLRU
1366         sync += r1.tlb_hit.eq(tlb_hit)
1367         sync += r1.tlb_hit_index.eq(tlb_req_index)
1368
1369     # Memory accesses are handled by this state machine:
1370     #
1371     #   * Cache load miss/reload (in conjunction with "rams")
1372     #   * Load hits for non-cachable forms
1373     #   * Stores (the collision case is handled in "rams")
1374     #
1375     # All wishbone requests generation is done here.
1376     # This machine operates at stage 1.
1377     def dcache_slow(self, m, r1, use_forward1_next, use_forward2_next,
1378                     r0, replace_way,
1379                     req_hit_way, req_same_tag,
1380                     r0_valid, req_op, cache_valids, req_go, ra):
1381
1382         comb = m.d.comb
1383         sync = m.d.sync
1384         bus = self.bus
1385         d_in = self.d_in
1386
1387         m.submodules.wr_tag = wr_tag = self.tagmem.write_port(
1388                                                     granularity=self.TAG_WIDTH)
1389
1390         req         = MemAccessRequest(self, "mreq_ds")
1391
1392         r1_next_cycle = Signal()
1393         req_row = Signal(self.ROW_BITS)
1394         req_idx = Signal(self.INDEX_BITS)
1395         req_tag = Signal(self.TAG_BITS)
1396         comb += req_idx.eq(self.get_index(req.real_addr))
1397         comb += req_row.eq(self.get_row(req.real_addr))
1398         comb += req_tag.eq(self.get_tag(req.real_addr))
1399
1400         sync += r1.use_forward1.eq(use_forward1_next)
1401         sync += r1.forward_sel.eq(0)
1402
1403         with m.If(use_forward1_next):
1404             sync += r1.forward_sel.eq(r1.req.byte_sel)
1405         with m.Elif(use_forward2_next):
1406             sync += r1.forward_sel.eq(r1.forward_sel1)
1407
1408         sync += r1.forward_data2.eq(r1.forward_data1)
1409         with m.If(r1.write_bram):
1410             sync += r1.forward_data1.eq(r1.req.data)
1411             sync += r1.forward_sel1.eq(r1.req.byte_sel)
1412             sync += r1.forward_way1.eq(r1.req.hit_way)
1413             sync += r1.forward_row1.eq(self.get_row(r1.req.real_addr))
1414             sync += r1.forward_valid1.eq(1)
1415         with m.Else():
1416             with m.If(r1.dcbz):
1417                 sync += r1.forward_data1.eq(0)
1418             with m.Else():
1419                 sync += r1.forward_data1.eq(bus.dat_r)
1420             sync += r1.forward_sel1.eq(~0) # all 1s
1421             sync += r1.forward_way1.eq(replace_way)
1422             sync += r1.forward_row1.eq(r1.store_row)
1423             sync += r1.forward_valid1.eq(0)
1424
1425         # One cycle pulses reset
1426         sync += r1.slow_valid.eq(0)
1427         sync += r1.write_bram.eq(0)
1428         sync += r1.inc_acks.eq(0)
1429         sync += r1.dec_acks.eq(0)
1430
1431         sync += r1.ls_valid.eq(0)
1432         # complete tlbies and TLB loads in the third cycle
1433         sync += r1.mmu_done.eq(r0_valid & (r0.tlbie | r0.tlbld))
1434
1435         with m.If((req_op == Op.OP_LOAD_HIT) | (req_op == Op.OP_STCX_FAIL)):
1436             with m.If(r0.mmu_req):
1437                 sync += r1.mmu_done.eq(1)
1438             with m.Else():
1439                 sync += r1.ls_valid.eq(1)
1440
1441         with m.If(r1.write_tag):
1442             # Store new tag in selected way
1443             replace_way_onehot = Signal(self.NUM_WAYS)
1444             comb += replace_way_onehot.eq(1<<replace_way)
1445             ct = Signal(self.TAG_RAM_WIDTH)
1446             comb += ct.eq(r1.reload_tag << (replace_way*self.TAG_WIDTH))
1447             comb += wr_tag.en.eq(replace_way_onehot)
1448             comb += wr_tag.addr.eq(r1.store_index)
1449             comb += wr_tag.data.eq(ct)
1450
1451             sync += r1.store_way.eq(replace_way)
1452             sync += r1.write_tag.eq(0)
1453
1454         # Take request from r1.req if there is one there,
1455         # else from req_op, ra, etc.
1456         with m.If(r1.full):
1457             comb += req.eq(r1.req)
1458         with m.Else():
1459             comb += req.op.eq(req_op)
1460             comb += req.valid.eq(req_go)
1461             comb += req.mmu_req.eq(r0.mmu_req)
1462             comb += req.dcbz.eq(r0.req.dcbz)
1463             comb += req.real_addr.eq(ra)
1464
1465             with m.If(r0.req.dcbz):
1466                 # force data to 0 for dcbz
1467                 comb += req.data.eq(0)
1468             with m.Elif(r0.d_valid):
1469                 comb += req.data.eq(r0.req.data)
1470             with m.Else():
1471                 comb += req.data.eq(d_in.data)
1472
1473             # Select all bytes for dcbz
1474             # and for cacheable loads
1475             with m.If(r0.req.dcbz | (r0.req.load & ~r0.req.nc)):
1476                 comb += req.byte_sel.eq(~0) # all 1s
1477             with m.Else():
1478                 comb += req.byte_sel.eq(r0.req.byte_sel)
1479             comb += req.hit_way.eq(req_hit_way)
1480             comb += req.same_tag.eq(req_same_tag)
1481
1482             # Store the incoming request from r0,
1483             # if it is a slow request
1484             # Note that r1.full = 1 implies req_op = OP_NONE
1485             with m.If((req_op == Op.OP_LOAD_MISS)
1486                       | (req_op == Op.OP_LOAD_NC)
1487                       | (req_op == Op.OP_STORE_MISS)
1488                       | (req_op == Op.OP_STORE_HIT)):
1489                 sync += r1.req.eq(req)
1490                 sync += r1.full.eq(1)
1491                 # do not let r1.state RELOAD_WAIT_ACK or STORE_WAIT_ACK
1492                 # destroy r1.req by overwriting r1.full back to zero
1493                 comb += r1_next_cycle.eq(1)
1494
1495         # Main state machine
1496         with m.Switch(r1.state):
1497
1498             with m.Case(State.IDLE):
1499                 sync += r1.wb.adr.eq(req.real_addr[self.ROW_OFF_BITS:])
1500                 sync += r1.wb.sel.eq(req.byte_sel)
1501                 sync += r1.wb.dat.eq(req.data)
1502                 sync += r1.dcbz.eq(req.dcbz)
1503
1504                 # Keep track of our index and way
1505                 # for subsequent stores.
1506                 sync += r1.store_index.eq(req_idx)
1507                 sync += r1.store_row.eq(req_row)
1508                 sync += r1.end_row_ix.eq(self.get_row_of_line(req_row)-1)
1509                 sync += r1.reload_tag.eq(req_tag)
1510                 sync += r1.req.same_tag.eq(1)
1511
1512                 with m.If(req.op == Op.OP_STORE_HIT):
1513                     sync += r1.store_way.eq(req.hit_way)
1514
1515                 #with m.If(r1.dec_acks):
1516                 #    sync += r1.acks_pending.eq(r1.acks_pending - 1)
1517
1518                 # Reset per-row valid bits,
1519                 # ready for handling OP_LOAD_MISS
1520                 for i in range(self.ROW_PER_LINE):
1521                     sync += r1.rows_valid[i].eq(0)
1522
1523                 with m.If(req_op != Op.OP_NONE):
1524                     sync += Display("cache op %d", req.op)
1525
1526                 with m.Switch(req.op):
1527                     with m.Case(Op.OP_LOAD_HIT):
1528                         # stay in IDLE state
1529                         pass
1530
1531                     with m.Case(Op.OP_LOAD_MISS):
1532                         sync += Display("cache miss real addr: %x " \
1533                                 "idx: %x tag: %x",
1534                                 req.real_addr, req_row, req_tag)
1535
1536                         # Start the wishbone cycle
1537                         sync += r1.wb.we.eq(0)
1538                         sync += r1.wb.cyc.eq(1)
1539                         sync += r1.wb.stb.eq(1)
1540
1541                         # Track that we had one request sent
1542                         sync += r1.state.eq(State.RELOAD_WAIT_ACK)
1543                         sync += r1.write_tag.eq(1)
1544
1545                     with m.Case(Op.OP_LOAD_NC):
1546                         sync += r1.wb.cyc.eq(1)
1547                         sync += r1.wb.stb.eq(1)
1548                         sync += r1.wb.we.eq(0)
1549                         sync += r1.state.eq(State.NC_LOAD_WAIT_ACK)
1550
1551                     with m.Case(Op.OP_STORE_HIT, Op.OP_STORE_MISS):
1552                         with m.If(~req.dcbz):
1553                             sync += r1.state.eq(State.STORE_WAIT_ACK)
1554                             sync += r1.acks_pending.eq(1)
1555                             sync += r1.full.eq(0)
1556                             comb += r1_next_cycle.eq(0)
1557                             sync += r1.slow_valid.eq(1)
1558
1559                             with m.If(req.mmu_req):
1560                                 sync += r1.mmu_done.eq(1)
1561                             with m.Else():
1562                                 sync += r1.ls_valid.eq(1)
1563
1564                             with m.If(req.op == Op.OP_STORE_HIT):
1565                                 sync += r1.write_bram.eq(1)
1566                         with m.Else():
1567                             # dcbz is handled much like a load miss except
1568                             # that we are writing to memory instead of reading
1569                             sync += r1.state.eq(State.RELOAD_WAIT_ACK)
1570
1571                             with m.If(req.op == Op.OP_STORE_MISS):
1572                                 sync += r1.write_tag.eq(1)
1573
1574                         sync += r1.wb.we.eq(1)
1575                         sync += r1.wb.cyc.eq(1)
1576                         sync += r1.wb.stb.eq(1)
1577
1578                     # OP_NONE and OP_BAD do nothing
1579                     # OP_BAD & OP_STCX_FAIL were
1580                     # handled above already
1581                     with m.Case(Op.OP_NONE):
1582                         pass
1583                     with m.Case(Op.OP_BAD):
1584                         pass
1585                     with m.Case(Op.OP_STCX_FAIL):
1586                         pass
1587
1588             with m.Case(State.RELOAD_WAIT_ACK):
1589                 ld_stbs_done = Signal()
1590                 # Requests are all sent if stb is 0
1591                 comb += ld_stbs_done.eq(~r1.wb.stb)
1592
1593                 # If we are still sending requests, was one accepted?
1594                 with m.If((~bus.stall) & r1.wb.stb):
1595                     # That was the last word?  We are done sending.
1596                     # Clear stb and set ld_stbs_done so we can handle an
1597                     # eventual last ack on the same cycle.
1598                     # sigh - reconstruct wb adr with 3 extra 0s at front
1599                     wb_adr = Cat(Const(0, self.ROW_OFF_BITS), r1.wb.adr)
1600                     with m.If(self.is_last_row_addr(wb_adr, r1.end_row_ix)):
1601                         sync += r1.wb.stb.eq(0)
1602                         comb += ld_stbs_done.eq(1)
1603
1604                     # Calculate the next row address in the current cache line
1605                     rlen = self.LINE_OFF_BITS-self.ROW_OFF_BITS
1606                     row = Signal(rlen)
1607                     comb += row.eq(r1.wb.adr)
1608                     sync += r1.wb.adr[:rlen].eq(row+1)
1609
1610                 # Incoming acks processing
1611                 sync += r1.forward_valid1.eq(bus.ack)
1612                 with m.If(bus.ack):
1613                     srow = Signal(self.ROW_LINE_BITS)
1614                     comb += srow.eq(r1.store_row)
1615                     sync += r1.rows_valid[srow].eq(1)
1616
1617                     # If this is the data we were looking for,
1618                     # we can complete the request next cycle.
1619                     # Compare the whole address in case the
1620                     # request in r1.req is not the one that
1621                     # started this refill.
1622                     rowmatch = Signal()
1623                     lastrow = Signal()
1624                     comb += rowmatch.eq(r1.store_row ==
1625                                         self.get_row(r1.req.real_addr))
1626                     comb += lastrow.eq(self.is_last_row(r1.store_row,
1627                                                       r1.end_row_ix))
1628                     with m.If(r1.full & r1.req.same_tag &
1629                               ((r1.dcbz & req.dcbz) |
1630                                (r1.req.op == Op.OP_LOAD_MISS)) & rowmatch):
1631                         sync += r1.full.eq(r1_next_cycle)
1632                         sync += r1.slow_valid.eq(1)
1633                         with m.If(r1.mmu_req):
1634                             sync += r1.mmu_done.eq(1)
1635                         with m.Else():
1636                             sync += r1.ls_valid.eq(1)
1637                         sync += r1.forward_sel.eq(~0) # all 1s
1638                         sync += r1.use_forward1.eq(1)
1639
1640                     # Check for completion
1641                     with m.If(ld_stbs_done & lastrow):
1642                         # Complete wishbone cycle
1643                         sync += r1.wb.cyc.eq(0)
1644
1645                         # Cache line is now valid
1646                         cv = Signal(self.INDEX_BITS)
1647                         comb += cv.eq(cache_valids[r1.store_index])
1648                         comb += cv.bit_select(r1.store_way, 1).eq(1)
1649                         sync += cache_valids[r1.store_index].eq(cv)
1650
1651                         sync += r1.state.eq(State.IDLE)
1652                         sync += Display("cache valid set %x "
1653                                         "idx %d way %d",
1654                                          cv, r1.store_index, r1.store_way)
1655
1656                     # Increment store row counter
1657                     sync += r1.store_row.eq(self.next_row(r1.store_row))
1658
1659             with m.Case(State.STORE_WAIT_ACK):
1660                 st_stbs_done = Signal()
1661                 adjust_acks = Signal(3)
1662
1663                 comb += st_stbs_done.eq(~r1.wb.stb)
1664
1665                 with m.If(r1.inc_acks != r1.dec_acks):
1666                     with m.If(r1.inc_acks):
1667                         comb += adjust_acks.eq(r1.acks_pending + 1)
1668                     with m.Else():
1669                         comb += adjust_acks.eq(r1.acks_pending - 1)
1670                 with m.Else():
1671                     comb += adjust_acks.eq(r1.acks_pending)
1672
1673                 sync += r1.acks_pending.eq(adjust_acks)
1674
1675                 # Clear stb when slave accepted request
1676                 with m.If(~bus.stall):
1677                     # See if there is another store waiting
1678                     # to be done which is in the same real page.
1679                     # (this is when same_tsg is true)
1680                     with m.If(req.valid):
1681                         _ra = req.real_addr[self.ROW_OFF_BITS:
1682                                             self.SET_SIZE_BITS]
1683                         alen = self.SET_SIZE_BITS-self.ROW_OFF_BITS
1684                         sync += r1.wb.adr[0:alen].eq(_ra)
1685                         sync += r1.wb.dat.eq(req.data)
1686                         sync += r1.wb.sel.eq(req.byte_sel)
1687
1688                     with m.If((adjust_acks < 7) & req.same_tag &
1689                                 ((req.op == Op.OP_STORE_MISS) |
1690                                  (req.op == Op.OP_STORE_HIT))):
1691                         sync += r1.wb.stb.eq(1)
1692                         comb += st_stbs_done.eq(0)
1693                         sync += r1.store_way.eq(req.hit_way)
1694                         sync += r1.store_row.eq(self.get_row(req.real_addr))
1695
1696                         with m.If(req.op == Op.OP_STORE_HIT):
1697                             sync += r1.write_bram.eq(1)
1698                         sync += r1.full.eq(r1_next_cycle)
1699                         sync += r1.slow_valid.eq(1)
1700
1701                         # Store requests never come from the MMU
1702                         sync += r1.ls_valid.eq(1)
1703                         comb += st_stbs_done.eq(0)
1704                         sync += r1.inc_acks.eq(1)
1705                     with m.Else():
1706                         sync += r1.wb.stb.eq(0)
1707                         comb += st_stbs_done.eq(1)
1708
1709                 # Got ack ? See if complete.
1710                 sync += Display("got ack %d %d stbs %d adjust_acks %d",
1711                                 bus.ack, bus.ack, st_stbs_done, adjust_acks)
1712                 with m.If(bus.ack):
1713                     with m.If(st_stbs_done & (adjust_acks == 1)):
1714                         sync += r1.state.eq(State.IDLE)
1715                         sync += r1.wb.cyc.eq(0)
1716                         sync += r1.wb.stb.eq(0)
1717                     sync += r1.dec_acks.eq(1)
1718
1719             with m.Case(State.NC_LOAD_WAIT_ACK):
1720                 # Clear stb when slave accepted request
1721                 with m.If(~bus.stall):
1722                     sync += r1.wb.stb.eq(0)
1723
1724                 # Got ack ? complete.
1725                 with m.If(bus.ack):
1726                     sync += r1.state.eq(State.IDLE)
1727                     sync += r1.full.eq(r1_next_cycle)
1728                     sync += r1.slow_valid.eq(1)
1729
1730                     with m.If(r1.mmu_req):
1731                         sync += r1.mmu_done.eq(1)
1732                     with m.Else():
1733                         sync += r1.ls_valid.eq(1)
1734
1735                     sync += r1.forward_sel.eq(~0) # all 1s
1736                     sync += r1.use_forward1.eq(1)
1737                     sync += r1.wb.cyc.eq(0)
1738                     sync += r1.wb.stb.eq(0)
1739
1740     def dcache_log(self, m, r1, valid_ra, tlb_hit, stall_out):
1741
1742         sync = m.d.sync
1743         d_out, bus, log_out = self.d_out, self.bus, self.log_out
1744
1745         sync += log_out.eq(Cat(r1.state[:3], valid_ra, tlb_hit.way[:3],
1746                                stall_out, req_op[:3], d_out.valid, d_out.error,
1747                                r1.wb.cyc, r1.wb.stb, bus.ack, bus.stall,
1748                                r1.real_adr[3:6]))
1749
1750     def elaborate(self, platform):
1751
1752         m = Module()
1753         comb, sync = m.d.comb, m.d.sync
1754         m_in, d_in = self.m_in, self.d_in
1755
1756         # Storage. Hopefully "cache_rows" is a BRAM, the rest is LUTs
1757         cache_valids     = self.CacheValidsArray()
1758         cache_tag_set    = Signal(self.TAG_RAM_WIDTH)
1759
1760         self.tagmem = Memory(depth=self.NUM_LINES, width=self.TAG_RAM_WIDTH)
1761
1762         """note: these are passed to nmigen.hdl.Memory as "attributes".
1763            don't know how, just that they are.
1764         """
1765         # TODO attribute ram_style of
1766         #  dtlb_tags : signal is "distributed";
1767         # TODO attribute ram_style of
1768         #  dtlb_ptes : signal is "distributed";
1769
1770         r0      = RegStage0("r0")
1771         r0_full = Signal()
1772
1773         r1 = RegStage1(self, "r1")
1774
1775         reservation = Reservation(self, "rsrv")
1776
1777         # Async signals on incoming request
1778         req_index    = Signal(self.INDEX_BITS)
1779         req_row      = Signal(self.ROW_BITS)
1780         req_hit_way  = Signal(self.WAY_BITS)
1781         req_tag      = Signal(self.TAG_BITS)
1782         req_op       = Signal(Op)
1783         req_data     = Signal(64)
1784         req_same_tag = Signal()
1785         req_go       = Signal()
1786
1787         early_req_row     = Signal(self.ROW_BITS)
1788
1789         cancel_store      = Signal()
1790         set_rsrv          = Signal()
1791         clear_rsrv        = Signal()
1792
1793         r0_valid          = Signal()
1794         r0_stall          = Signal()
1795
1796         use_forward1_next = Signal()
1797         use_forward2_next = Signal()
1798
1799         cache_out_row     = Signal(WB_DATA_BITS)
1800
1801         plru_victim       = Signal(self.WAY_BITS)
1802         replace_way       = Signal(self.WAY_BITS)
1803
1804         # Wishbone read/write/cache write formatting signals
1805         bus_sel           = Signal(8)
1806
1807         # TLB signals
1808         tlb_way       = self.TLBRecord("tlb_way")
1809         tlb_req_index = Signal(self.TLB_SET_BITS)
1810         tlb_hit       = self.TLBHit("tlb_hit")
1811         pte           = Signal(self.TLB_PTE_BITS)
1812         ra            = Signal(self.REAL_ADDR_BITS)
1813         valid_ra      = Signal()
1814         perm_attr     = PermAttr("dc_perms")
1815         rc_ok         = Signal()
1816         perm_ok       = Signal()
1817         access_ok     = Signal()
1818
1819         tlb_plru_victim = Signal(self.TLB_WAY_BITS)
1820
1821         # we don't yet handle collisions between loadstore1 requests
1822         # and MMU requests
1823         comb += self.m_out.stall.eq(0)
1824
1825         # Hold off the request in r0 when r1 has an uncompleted request
1826         comb += r0_stall.eq(r0_full & (r1.full | d_in.hold))
1827         comb += r0_valid.eq(r0_full & ~r1.full & ~d_in.hold)
1828         comb += self.stall_out.eq(r0_stall)
1829         # debugging: detect if any stall ever requested, which is fine,
1830         # but if a request comes in when stall requested, that's bad.
1831         with m.If(r0_stall):
1832             sync += self.any_stall_out.eq(1)
1833             with m.If(d_in.valid):
1834                 sync += self.dreq_when_stall.eq(1)
1835             with m.If(m_in.valid):
1836                 sync += self.mreq_when_stall.eq(1)
1837
1838         # deal with litex not doing wishbone pipeline mode
1839         # XXX in wrong way.  FIFOs are needed in the SRAM test
1840         # so that stb/ack match up. same thing done in icache.py
1841         if not self.microwatt_compat:
1842             comb += self.bus.stall.eq(self.bus.cyc & ~self.bus.ack)
1843
1844         # Wire up wishbone request latch out of stage 1
1845         comb += self.bus.we.eq(r1.wb.we)
1846         comb += self.bus.adr.eq(r1.wb.adr)
1847         comb += self.bus.sel.eq(r1.wb.sel)
1848         comb += self.bus.stb.eq(r1.wb.stb)
1849         comb += self.bus.dat_w.eq(r1.wb.dat)
1850         comb += self.bus.cyc.eq(r1.wb.cyc)
1851
1852         # create submodule TLBUpdate
1853         m.submodules.dtlb_update = self.dtlb_update = DTLBUpdate(self)
1854
1855         # call sub-functions putting everything together, using shared
1856         # signals established above
1857         self.stage_0(m, r0, r1, r0_full)
1858         self.tlb_read(m, r0_stall, tlb_way)
1859         self.tlb_search(m, tlb_req_index, r0, r0_valid,
1860                         tlb_way,
1861                         pte, tlb_hit, valid_ra, perm_attr, ra)
1862         self.tlb_update(m, r0_valid, r0, tlb_req_index,
1863                         tlb_hit, tlb_plru_victim)
1864         self.maybe_plrus(m, r1, plru_victim)
1865         self.maybe_tlb_plrus(m, r1, tlb_plru_victim, tlb_req_index)
1866         self.cache_tag_read(m, r0_stall, req_index, cache_tag_set)
1867         self.dcache_request(m, r0, ra, req_index, req_row, req_tag,
1868                            r0_valid, r1, cache_valids, replace_way,
1869                            use_forward1_next, use_forward2_next,
1870                            req_hit_way, plru_victim, rc_ok, perm_attr,
1871                            valid_ra, perm_ok, access_ok, req_op, req_go,
1872                            tlb_hit, tlb_way, cache_tag_set,
1873                            cancel_store, req_same_tag, r0_stall, early_req_row)
1874         self.reservation_comb(m, cancel_store, set_rsrv, clear_rsrv,
1875                            r0_valid, r0, reservation)
1876         self.reservation_reg(m, r0_valid, access_ok, set_rsrv, clear_rsrv,
1877                            reservation, r0)
1878         self.writeback_control(m, r1, cache_out_row)
1879         self.rams(m, r1, early_req_row, cache_out_row, replace_way)
1880         self.dcache_fast_hit(m, req_op, r0_valid, r0, r1,
1881                         req_hit_way, req_index, req_tag, access_ok,
1882                         tlb_hit, tlb_req_index)
1883         self.dcache_slow(m, r1, use_forward1_next, use_forward2_next,
1884                     r0, replace_way,
1885                     req_hit_way, req_same_tag,
1886                          r0_valid, req_op, cache_valids, req_go, ra)
1887         #self.dcache_log(m, r1, valid_ra, tlb_hit, stall_out)
1888
1889         return m
1890
1891
1892 if __name__ == '__main__':
1893     dut = DCache()
1894     vl = rtlil.convert(dut, ports=[])
1895     with open("test_dcache.il", "w") as f:
1896         f.write(vl)