1 """ICache
2
3 based on Anton Blanchard microwatt icache.vhdl
4
5 Set associative icache
6
7 TODO (in no specific order):
8 * Add debug interface to inspect cache content
9 * Add snoop/invalidate path
10 * Add multi-hit error detection
11 * Pipelined bus interface (wb or axi)
12 * Maybe add parity? There's a few bits free in each BRAM row on Xilinx
13 * Add optimization: service hits on partially loaded lines
14 * Add optimization: (maybe) interrupt reload on fluch/redirect
15 * Check if playing with the geometry of the cache tags allow for more
16 efficient use of distributed RAM and less logic/muxes. Currently we
17 write TAG_BITS width which may not match full ram blocks and might
18 cause muxes to be inferred for "partial writes".
19 * Check if making the read size of PLRU a ROM helps utilization
20
21 Links:
22
23 * https://bugs.libre-soc.org/show_bug.cgi?id=485
24 * https://libre-soc.org/irclog-microwatt/%23microwatt.2021-12-07.log.html
25 (discussion about brams for ECP5)
26
27 """
28
from enum import (Enum, unique)
from nmigen import (Module, Signal, Elaboratable, Cat, Array, Const, Repl,
                    Record)
from nmigen.cli import main, rtlil
from nmutil.iocontrol import RecordObject
from nmigen.utils import log2_int
from nmigen.lib.coding import Decoder
from nmutil.util import Display

#from nmutil.plru import PLRU
from soc.experiment.plru import PLRU, PLRUs
from soc.experiment.cache_ram import CacheRam

from soc.experiment.mem_types import (Fetch1ToICacheType,
                                      ICacheToDecode1Type,
                                      MMUToICacheType)

from soc.experiment.wb_types import (WB_ADDR_BITS, WB_DATA_BITS,
                                     WB_SEL_BITS, WBAddrType, WBDataType,
                                     WBSelType, WBMasterOut, WBSlaveOut,
                                     )

from nmigen_soc.wishbone.bus import Interface
from soc.minerva.units.fetch import FetchUnitInterface


# for test
from soc.bus.sram import SRAM
from nmigen import Memory
from nmutil.util import wrap
from nmigen.cli import main, rtlil

# NOTE: to use cxxsim, export NMIGEN_SIM_MODE=cxxsim from the shell
# Also, check out the cxxsim nmigen branch, and latest yosys from git
from nmutil.sim_tmp_alternative import Simulator, Settle

SIM = 0
LINE_SIZE = 64
# BRAM organisation: We never access more than wishbone_data_bits
# at a time so to save resources we make the array only that wide,
# and use consecutive indices to make a cache "line"
#
# ROW_SIZE is the width in bytes of the BRAM (based on WB, so 64-bits)
ROW_SIZE = WB_DATA_BITS // 8
# Number of lines in a set
NUM_LINES = 64
# Number of ways
NUM_WAYS = 2
# L1 ITLB number of entries (direct mapped)
TLB_SIZE = 64
# L1 ITLB log_2(page_size)
TLB_LG_PGSZ = 12
# Number of real address bits that we store
REAL_ADDR_BITS = 56
# Non-zero to enable log data collection
LOG_LENGTH = 0

ROW_SIZE_BITS = ROW_SIZE * 8
# ROW_PER_LINE is the number of row (wishbone) transactions in a line
ROW_PER_LINE = LINE_SIZE // ROW_SIZE
# BRAM_ROWS is the number of rows in BRAM needed to represent the full icache
BRAM_ROWS = NUM_LINES * ROW_PER_LINE
# INSN_PER_ROW is the number of 32bit instructions per BRAM row
INSN_PER_ROW = ROW_SIZE_BITS // 32

# Bit fields counts in the address
#
# INSN_BITS is the number of bits to select an instruction in a row
INSN_BITS = log2_int(INSN_PER_ROW)
# ROW_BITS is the number of bits to select a row
ROW_BITS = log2_int(BRAM_ROWS)
# ROW_LINE_BITS is the number of bits to select a row within a line
ROW_LINE_BITS = log2_int(ROW_PER_LINE)
# LINE_OFF_BITS is the number of bits for the offset in a cache line
LINE_OFF_BITS = log2_int(LINE_SIZE)
# ROW_OFF_BITS is the number of bits for the offset in a row
ROW_OFF_BITS = log2_int(ROW_SIZE)
# INDEX_BITS is the number of bits to select a cache line
INDEX_BITS = log2_int(NUM_LINES)
# SET_SIZE_BITS is the log base 2 of the set size
SET_SIZE_BITS = LINE_OFF_BITS + INDEX_BITS
# TAG_BITS is the number of bits of the tag part of the address
TAG_BITS = REAL_ADDR_BITS - SET_SIZE_BITS
# TAG_WIDTH is the width in bits of each way of the tag RAM
TAG_WIDTH = TAG_BITS + 7 - ((TAG_BITS + 7) % 8)

# WAY_BITS is the number of bits to select a way
WAY_BITS = log2_int(NUM_WAYS)
TAG_RAM_WIDTH = TAG_BITS * NUM_WAYS

# L1 ITLB
TLB_BITS = log2_int(TLB_SIZE)
TLB_EA_TAG_BITS = 64 - (TLB_LG_PGSZ + TLB_BITS)
TLB_PTE_BITS = 64

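# With the defaults (TLB_SIZE=64, TLB_LG_PGSZ=12) this gives TLB_BITS=6 and
# TLB_EA_TAG_BITS = 64 - (12 + 6) = 46, so each ITLB entry is a 110-bit
# {tag, pte} pair (see TLBRecord below) in a 64-entry direct-mapped table.
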
125 print("BRAM_ROWS =", BRAM_ROWS)
126 print("INDEX_BITS =", INDEX_BITS)
127 print("INSN_BITS =", INSN_BITS)
128 print("INSN_PER_ROW =", INSN_PER_ROW)
129 print("LINE_SIZE =", LINE_SIZE)
130 print("LINE_OFF_BITS =", LINE_OFF_BITS)
131 print("LOG_LENGTH =", LOG_LENGTH)
132 print("NUM_LINES =", NUM_LINES)
133 print("NUM_WAYS =", NUM_WAYS)
134 print("REAL_ADDR_BITS =", REAL_ADDR_BITS)
135 print("ROW_BITS =", ROW_BITS)
136 print("ROW_OFF_BITS =", ROW_OFF_BITS)
137 print("ROW_LINE_BITS =", ROW_LINE_BITS)
138 print("ROW_PER_LINE =", ROW_PER_LINE)
139 print("ROW_SIZE =", ROW_SIZE)
140 print("ROW_SIZE_BITS =", ROW_SIZE_BITS)
141 print("SET_SIZE_BITS =", SET_SIZE_BITS)
142 print("SIM =", SIM)
143 print("TAG_BITS =", TAG_BITS)
144 print("TAG_RAM_WIDTH =", TAG_RAM_WIDTH)
145 print("TAG_BITS =", TAG_BITS)
146 print("TLB_BITS =", TLB_BITS)
147 print("TLB_EA_TAG_BITS =", TLB_EA_TAG_BITS)
148 print("TLB_LG_PGSZ =", TLB_LG_PGSZ)
149 print("TLB_PTE_BITS =", TLB_PTE_BITS)
150 print("TLB_SIZE =", TLB_SIZE)
151 print("WAY_BITS =", WAY_BITS)
152
# from microwatt/utils.vhdl
def ispow2(n):
    return n != 0 and (n & (n - 1)) == 0

assert LINE_SIZE % ROW_SIZE == 0
assert ispow2(LINE_SIZE), "LINE_SIZE not power of 2"
assert ispow2(NUM_LINES), "NUM_LINES not power of 2"
assert ispow2(ROW_PER_LINE), "ROW_PER_LINE not power of 2"
assert ispow2(INSN_PER_ROW), "INSN_PER_ROW not power of 2"
assert (ROW_BITS == (INDEX_BITS + ROW_LINE_BITS)), \
    "geometry bits don't add up"
assert (LINE_OFF_BITS == (ROW_OFF_BITS + ROW_LINE_BITS)), \
    "geometry bits don't add up"
assert (REAL_ADDR_BITS == (TAG_BITS + INDEX_BITS + LINE_OFF_BITS)), \
    "geometry bits don't add up"
assert (REAL_ADDR_BITS == (TAG_BITS + ROW_BITS + ROW_OFF_BITS)), \
    "geometry bits don't add up"

# Example of layout for 32 lines of 64 bytes:
#
# ..  tag    |index|  line  |
# ..         |   row   |    |
# ..         |     |   | |00| zero          (2)
# ..         |     |   |-|  | INSN_BITS     (1)
# ..         |     |---|    | ROW_LINE_BITS (3)
# ..         |     |--- - --| LINE_OFF_BITS (6)
# ..         |         |- --| ROW_OFF_BITS  (3)
# ..         |----- ---|    | ROW_BITS      (8)
# ..         |-----|        | INDEX_BITS    (5)
# .. --------|              | TAG_BITS      (53)

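# For reference, with the default geometry above (NUM_LINES=64, LINE_SIZE=64,
# ROW_SIZE=8, REAL_ADDR_BITS=56) the actual split is: ROW_OFF_BITS=3,
# ROW_LINE_BITS=3, LINE_OFF_BITS=6, INDEX_BITS=6, ROW_BITS=9,
# SET_SIZE_BITS=12 and TAG_BITS = 56 - 12 = 44, i.e.
# real_addr = {tag[43:0], index[5:0], line_offset[5:0]}.
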
# The cache data BRAM organized as described above for each way
#subtype cache_row_t is std_ulogic_vector(ROW_SIZE_BITS-1 downto 0);
#
# The cache tags LUTRAM has a row per set. Vivado is a pain and will
# not handle a clean (commented) definition of the cache tags as a 3d
# memory. For now, work around it by putting all the tags for a set
# into a single row (TAG_RAM_WIDTH wide).
def CacheTagArray():
    return Array(Signal(TAG_RAM_WIDTH, name="tag%d" % x) \
                 for x in range(NUM_LINES))

def CacheValidsArray():
    return Array(Signal(NUM_WAYS, name="tag_valids%d" % x) \
                 for x in range(NUM_LINES))

def RowPerLineValidArray():
    return Array(Signal(name="rows_valid_%d" %x) \
                 for x in range(ROW_PER_LINE))


# TODO to be passed to nmigen as ram attributes
# attribute ram_style : string;
# attribute ram_style of cache_tags : signal is "distributed";

def TLBValidArray():
    return Array(Signal(name="tlb_valid%d" % x)
                 for x in range(TLB_SIZE))

def TLBRecord(name):
    tlb_layout = [ ('tag', TLB_EA_TAG_BITS),
                   ('pte', TLB_PTE_BITS)
                 ]
    return Record(tlb_layout, name=name)

def TLBArray():
    return Array(TLBRecord("tlb%d" % x) for x in range(TLB_SIZE))

# PLRU output interface
def PLRUOut():
    return Array(Signal(WAY_BITS, name="plru_out_%d" %x) \
                 for x in range(NUM_LINES))

# Return the cache line index (tag index) for an address
def get_index(addr):
    return addr[LINE_OFF_BITS:SET_SIZE_BITS]

# Return the cache row index (data memory) for an address
def get_row(addr):
    return addr[ROW_OFF_BITS:SET_SIZE_BITS]

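# With the default geometry these slices work out to
# get_index(addr) == addr[6:12] (6-bit line index) and
# get_row(addr) == addr[3:12] (9-bit BRAM row index).
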
# Return the index of a row within a line
def get_row_of_line(row):
    return row[:ROW_BITS][:ROW_LINE_BITS]

# Returns whether the address is in the last row of a line
def is_last_row_addr(addr, last):
    return addr[ROW_OFF_BITS:LINE_OFF_BITS] == last

# Returns whether this is the last row of a line
def is_last_row(row, last):
    return get_row_of_line(row) == last

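# Note: "last" here is the row-within-line index at which the reload stops
# (r.end_row_ix). The reload starts at the requested row and wraps around
# the line, so end_row_ix is one row before the starting row
# (mod ROW_PER_LINE) -- see icache_miss_idle below.
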
# Return the next row in the current cache line. We use a dedicated
# function in order to limit the size of the generated adder to be
# only the bits within a cache line (3 bits with default settings)
def next_row(row):
    row_v = row[0:ROW_LINE_BITS] + 1
    return Cat(row_v[:ROW_LINE_BITS], row[ROW_LINE_BITS:])

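# e.g. with ROW_LINE_BITS=3, a row value of 0b101_111 (index 0b101, last
# row 0b111) becomes 0b101_000: the 3 low bits increment and wrap, the
# upper index bits pass through unchanged.
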
# Read the instruction word for the given address
# in the current cache row
def read_insn_word(addr, data):
    word = addr[2:INSN_BITS+2]
    return data.word_select(word, 32)

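# With INSN_PER_ROW=2 (64-bit rows) this is a single bit, addr[2], which
# selects the low or high 32-bit instruction within the 64-bit BRAM row.
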
# Get the tag value from the address
def get_tag(addr):
    return addr[SET_SIZE_BITS:REAL_ADDR_BITS]

# Read a tag from a tag memory row
def read_tag(way, tagset):
    return tagset.word_select(way, TAG_BITS)

# Write a tag to tag memory row
def write_tag(way, tagset, tag):
    return read_tag(way, tagset).eq(tag)

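# Tags for all ways of a set are packed into one TAG_RAM_WIDTH-wide row:
# way i occupies bits [i*TAG_BITS : (i+1)*TAG_BITS]. With TAG_BITS=44 and
# NUM_WAYS=2 the row is 88 bits, way 0 in [0:44] and way 1 in [44:88].
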
# Simple hash for direct-mapped TLB index
def hash_ea(addr):
    hsh = (addr[TLB_LG_PGSZ:TLB_LG_PGSZ + TLB_BITS] ^
           addr[TLB_LG_PGSZ + TLB_BITS:TLB_LG_PGSZ + 2 * TLB_BITS] ^
           addr[TLB_LG_PGSZ + 2 * TLB_BITS:TLB_LG_PGSZ + 3 * TLB_BITS])
    return hsh

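# With TLB_LG_PGSZ=12 and TLB_BITS=6 this folds the effective address as
#     hsh = addr[12:18] ^ addr[18:24] ^ addr[24:30]
# giving a 6-bit index into the 64-entry direct-mapped ITLB.
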

# Cache reload state machine
@unique
class State(Enum):
    IDLE     = 0
    CLR_TAG  = 1
    WAIT_ACK = 2


class RegInternal(RecordObject):
    def __init__(self):
        super().__init__()
        # Cache hit state (Latches for 1 cycle BRAM access)
        self.hit_way      = Signal(WAY_BITS)
        self.hit_nia      = Signal(64)
        self.hit_smark    = Signal()
        self.hit_valid    = Signal()

        # Cache miss state (reload state machine)
        self.state        = Signal(State, reset=State.IDLE)
        self.wb           = WBMasterOut("wb")
        self.req_adr      = Signal(64)
        self.store_way    = Signal(WAY_BITS)
        self.store_index  = Signal(INDEX_BITS)
        self.store_row    = Signal(ROW_BITS)
        self.store_tag    = Signal(TAG_BITS)
        self.store_valid  = Signal()
        self.end_row_ix   = Signal(ROW_LINE_BITS)
        self.rows_valid   = RowPerLineValidArray()

        # TLB miss state
        self.fetch_failed = Signal()

class ICache(FetchUnitInterface, Elaboratable):
    """64 bit set associative icache. All instructions are 4B aligned."""
    def __init__(self, pspec):
        FetchUnitInterface.__init__(self, pspec)
        self.i_in           = Fetch1ToICacheType(name="i_in")
        self.i_out          = ICacheToDecode1Type(name="i_out")

        self.m_in           = MMUToICacheType(name="m_in")

        self.stall_in       = Signal()
        self.stall_out      = Signal()
        self.flush_in       = Signal()
        self.inval_in       = Signal()

        # standard naming (wired to non-standard for compatibility)
        self.bus = Interface(addr_width=32,
                             data_width=64,
                             granularity=8,
                             features={'stall'},
                             alignment=0,
                             name="icache_wb")

        self.log_out        = Signal(54)

        # use FetchUnitInterface, helps keep some unit tests running
        self.use_fetch_iface = False

    def use_fetch_interface(self):
        self.use_fetch_iface = True

    # Generate a cache RAM for each way
    def rams(self, m, r, cache_out_row, use_previous,
             replace_way, req_row):

        comb = m.d.comb
        sync = m.d.sync

        bus, stall_in = self.bus, self.stall_in

        # read condition (for every cache ram)
        do_read = Signal()
        comb += do_read.eq(~(stall_in | use_previous))

        rd_addr = Signal(ROW_BITS)
        wr_addr = Signal(ROW_BITS)
        comb += rd_addr.eq(req_row)
        comb += wr_addr.eq(r.store_row)

        # binary-to-unary converters: replace-way enabled by bus.ack,
        # hit-way left permanently enabled
        m.submodules.replace_way_e = re = Decoder(NUM_WAYS)
        m.submodules.hit_way_e = he = Decoder(NUM_WAYS)
        comb += re.i.eq(replace_way)
        comb += re.n.eq(~bus.ack)
        comb += he.i.eq(r.hit_way)

        for i in range(NUM_WAYS):
            do_write = Signal(name="do_wr_%d" % i)
            d_out    = Signal(ROW_SIZE_BITS, name="d_out_%d" % i)
            wr_sel   = Signal(ROW_SIZE, name="wr_sel_%d" % i)

            way = CacheRam(ROW_BITS, ROW_SIZE_BITS, TRACE=True, ram_num=i)
            m.submodules["cacheram_%d" % i] = way

            comb += way.rd_en.eq(do_read)
            comb += way.rd_addr.eq(rd_addr)
            comb += d_out.eq(way.rd_data_o)
            comb += way.wr_sel.eq(wr_sel)
            comb += way.wr_addr.eq(wr_addr)
            comb += way.wr_data.eq(bus.dat_r)

            comb += do_write.eq(re.o[i])

            with m.If(do_write):
                sync += Display("cache write adr: %x data: %lx",
                                wr_addr, way.wr_data)

            with m.If(he.o[i]):
                comb += cache_out_row.eq(d_out)
                with m.If(do_read):
                    sync += Display("cache read adr: %x data: %x",
                                    req_row, d_out)

            comb += wr_sel.eq(Repl(do_write, ROW_SIZE))

    # Generate PLRUs
    def maybe_plrus(self, m, r, plru_victim):
        comb = m.d.comb

        if NUM_WAYS == 0:
            return


        m.submodules.plrus = plru = PLRUs(NUM_LINES, WAY_BITS)
        comb += plru.way.eq(r.hit_way)
        comb += plru.valid.eq(r.hit_valid)
        comb += plru.index.eq(get_index(r.hit_nia))
        comb += plru.isel.eq(r.store_index) # select victim
        comb += plru_victim.eq(plru.o_index) # selected victim

    # TLB hit detection and real address generation
    def itlb_lookup(self, m, tlb_req_index, itlb, itlb_valid,
                    real_addr, ra_valid, eaa_priv,
                    priv_fault, access_ok):

        comb = m.d.comb

        i_in = self.i_in

        # use an *asynchronous* Memory read port here (combinatorial)
        m.submodules.rd_tlb = rd_tlb = self.tlbmem.read_port(domain="comb")
        tlb = TLBRecord("tlb_rdport")
        pte, ttag = tlb.pte, tlb.tag

        comb += tlb_req_index.eq(hash_ea(i_in.nia))
        comb += rd_tlb.addr.eq(tlb_req_index)
        comb += tlb.eq(rd_tlb.data)

        with m.If(i_in.virt_mode):
            comb += real_addr.eq(Cat(i_in.nia[:TLB_LG_PGSZ],
                                     pte[TLB_LG_PGSZ:REAL_ADDR_BITS]))

            with m.If(ttag == i_in.nia[TLB_LG_PGSZ + TLB_BITS:64]):
                comb += ra_valid.eq(itlb_valid[tlb_req_index])

            comb += eaa_priv.eq(pte[3])

        with m.Else():
            comb += real_addr.eq(i_in.nia[:REAL_ADDR_BITS])
            comb += ra_valid.eq(1)
            comb += eaa_priv.eq(1)

        # No IAMR, so no KUEP support for now
        comb += priv_fault.eq(eaa_priv & ~i_in.priv_mode)
        comb += access_ok.eq(ra_valid & ~priv_fault)

    # iTLB update
    def itlb_update(self, m, itlb, itlb_valid):
        comb = m.d.comb
        sync = m.d.sync

        m_in = self.m_in

        wr_index = Signal(TLB_SIZE)
        comb += wr_index.eq(hash_ea(m_in.addr))

        m.submodules.wr_tlb = wr_tlb = self.tlbmem.write_port()

        with m.If(m_in.tlbie & m_in.doall):
            # Clear all valid bits
            for i in range(TLB_SIZE):
                sync += itlb_valid[i].eq(0)

        with m.Elif(m_in.tlbie):
            # Clear entry regardless of hit or miss
            sync += itlb_valid[wr_index].eq(0)

        with m.Elif(m_in.tlbld):
            tlb = TLBRecord("tlb_wrport")
            comb += tlb.tag.eq(m_in.addr[TLB_LG_PGSZ + TLB_BITS:64])
            comb += tlb.pte.eq(m_in.pte)
            comb += wr_tlb.en.eq(1)
            comb += wr_tlb.addr.eq(wr_index)
            comb += wr_tlb.data.eq(tlb)
            sync += itlb_valid[wr_index].eq(1)

    # Cache hit detection, output to fetch2 and other misc logic
    def icache_comb(self, m, use_previous, r, req_index, req_row,
                    req_hit_way, req_tag, real_addr, req_laddr,
                    cache_valids, access_ok,
                    req_is_hit, req_is_miss, replace_way,
                    plru_victim, cache_out_row):

        comb = m.d.comb
        m.submodules.rd_tag = rd_tag = self.tagmem.read_port(domain="comb")

        i_in, i_out, bus = self.i_in, self.i_out, self.bus
        flush_in, stall_out = self.flush_in, self.stall_out

        is_hit  = Signal()
        hit_way = Signal(WAY_BITS)

        # i_in.sequential means that i_in.nia this cycle is 4 more than
        # last cycle. If we read more than 32 bits at a time, had a
        # cache hit last cycle, and we don't want the first 32-bit chunk
        # then we can keep the data we read last cycle and just use that.
        with m.If(i_in.nia[2:INSN_BITS+2] != 0):
            comb += use_previous.eq(i_in.sequential & r.hit_valid)

        # Extract line, row and tag from request
        comb += req_index.eq(get_index(i_in.nia))
        comb += req_row.eq(get_row(i_in.nia))
        comb += req_tag.eq(get_tag(real_addr))

        # Calculate address of beginning of cache row, will be
        # used for cache miss processing if needed
        comb += req_laddr.eq(Cat(
                 Const(0, ROW_OFF_BITS),
                 real_addr[ROW_OFF_BITS:REAL_ADDR_BITS],
                 ))

        # Test if pending request is a hit on any way
        hitcond = Signal()
        comb += hitcond.eq((r.state == State.WAIT_ACK)
                           & (req_index == r.store_index)
                           & r.rows_valid[req_row % ROW_PER_LINE]
                          )
        # i_in.req asserts Decoder active
        cvb = Signal(NUM_WAYS)
        ctag = Signal(TAG_RAM_WIDTH)
        comb += rd_tag.addr.eq(req_index)
        comb += ctag.eq(rd_tag.data)
        comb += cvb.eq(cache_valids[req_index])
        m.submodules.store_way_e = se = Decoder(NUM_WAYS)
        comb += se.i.eq(r.store_way)
        comb += se.n.eq(~i_in.req)
        for i in range(NUM_WAYS):
            tagi = Signal(TAG_BITS, name="tag_i%d" % i)
            hit_test = Signal(name="hit_test%d" % i)
            is_tag_hit = Signal(name="is_tag_hit_%d" % i)
            comb += tagi.eq(read_tag(i, ctag))
            comb += hit_test.eq(se.o[i])
            comb += is_tag_hit.eq((cvb[i] | (hitcond & hit_test)) &
                                  (tagi == req_tag))
            with m.If(is_tag_hit):
                comb += hit_way.eq(i)
                comb += is_hit.eq(1)

        # Generate the "hit" and "miss" signals
        # for the synchronous blocks
        with m.If(i_in.req & access_ok & ~flush_in):
            comb += req_is_hit.eq(is_hit)
            comb += req_is_miss.eq(~is_hit)

        comb += req_hit_way.eq(hit_way)

        # The way to replace on a miss
        with m.If(r.state == State.CLR_TAG):
            comb += replace_way.eq(plru_victim)
        with m.Else():
            comb += replace_way.eq(r.store_way)

        # Output instruction from current cache row
        #
        # Note: This is a mild violation of our design principle of
        # having pipeline stages output from a clean latch. In this
        # case we output the result of a mux. The alternative would
        # be to output an entire row, which I prefer not to do just yet
        # as it would force fetch2 to know about some of the cache
        # geometry information.
        comb += i_out.insn.eq(read_insn_word(r.hit_nia, cache_out_row))
        comb += i_out.valid.eq(r.hit_valid)
        comb += i_out.nia.eq(r.hit_nia)
        comb += i_out.stop_mark.eq(r.hit_smark)
        comb += i_out.fetch_failed.eq(r.fetch_failed)

        # Stall fetch1 if we have a miss on cache or TLB
        # or a protection fault
        comb += stall_out.eq(~(is_hit & access_ok))

        # Wishbone requests output (from the cache miss reload machine)
        comb += bus.we.eq(r.wb.we)
        comb += bus.adr.eq(r.wb.adr)
        comb += bus.sel.eq(r.wb.sel)
        comb += bus.stb.eq(r.wb.stb)
        comb += bus.dat_w.eq(r.wb.dat)
        comb += bus.cyc.eq(r.wb.cyc)

    # Cache hit synchronous machine
    def icache_hit(self, m, use_previous, r, req_is_hit, req_hit_way,
                   req_index, req_tag, real_addr):
        sync = m.d.sync

        i_in, stall_in = self.i_in, self.stall_in
        flush_in = self.flush_in

        # keep outputs to fetch2 unchanged on a stall
        # except that flush or reset sets valid to 0
        # If use_previous, keep the same data as last
        # cycle and use the second half
        with m.If(stall_in | use_previous):
            with m.If(flush_in):
                sync += r.hit_valid.eq(0)
        with m.Else():
            # On a hit, latch the request for the next cycle,
            # when the BRAM data will be available on the
            # cache_out output of the corresponding way
            sync += r.hit_valid.eq(req_is_hit)

            with m.If(req_is_hit):
                sync += r.hit_way.eq(req_hit_way)
                sync += Display("cache hit nia:%x IR:%x SM:%x idx:%x tag:%x "
                                "way:%x RA:%x", i_in.nia, i_in.virt_mode,
                                i_in.stop_mark, req_index, req_tag,
                                req_hit_way, real_addr)

        with m.If(~stall_in):
            # Send stop marks and NIA down regardless of validity
            sync += r.hit_smark.eq(i_in.stop_mark)
            sync += r.hit_nia.eq(i_in.nia)

    def icache_miss_idle(self, m, r, req_is_miss, req_laddr,
                         req_index, req_tag, replace_way, real_addr):
        comb = m.d.comb
        sync = m.d.sync

        i_in = self.i_in

        # Reset per-row valid flags, only used in WAIT_ACK
        for i in range(ROW_PER_LINE):
            sync += r.rows_valid[i].eq(0)

        # We need to read a cache line
        with m.If(req_is_miss):
            sync += Display(
                     "cache miss nia:%x IR:%x SM:%x idx:%x "
                     " way:%x tag:%x RA:%x", i_in.nia,
                     i_in.virt_mode, i_in.stop_mark, req_index,
                     replace_way, req_tag, real_addr)

            # Keep track of our index and way for subsequent stores
            st_row = Signal(ROW_BITS)
            comb += st_row.eq(get_row(req_laddr))
            sync += r.store_index.eq(req_index)
            sync += r.store_row.eq(st_row)
            sync += r.store_tag.eq(req_tag)
            sync += r.store_valid.eq(1)
            sync += r.end_row_ix.eq(get_row_of_line(st_row) - 1)

            # Prep for first wishbone read.  We calculate the address
            # of the start of the cache line and start the WB cycle.
            sync += r.req_adr.eq(req_laddr)
            sync += r.wb.cyc.eq(1)
            sync += r.wb.stb.eq(1)

            # Track that we had one request sent
            sync += r.state.eq(State.CLR_TAG)

    def icache_miss_clr_tag(self, m, r, replace_way,
                            req_index,
                            cache_valids):
        comb = m.d.comb
        sync = m.d.sync
        m.submodules.wr_tag = wr_tag = self.tagmem.write_port(
                                            granularity=TAG_BITS)

        # Get victim way from plru
        sync += r.store_way.eq(replace_way)

        # Force misses on that way while reloading that line
        cv = Signal(INDEX_BITS)
        comb += cv.eq(cache_valids[req_index])
        comb += cv.bit_select(replace_way, 1).eq(0)
        sync += cache_valids[req_index].eq(cv)

        # use write-port "granularity" to select the tag to write to
        # TODO: the Memory should be multiplied-up (by NUM_TAGS)
        tagset = Signal(TAG_RAM_WIDTH)
        comb += tagset.eq(r.store_tag << (replace_way*TAG_BITS))
        comb += wr_tag.en.eq(1<<replace_way)
        comb += wr_tag.addr.eq(r.store_index)
        comb += wr_tag.data.eq(tagset)

        sync += r.state.eq(State.WAIT_ACK)

    def icache_miss_wait_ack(self, m, r, replace_way, inval_in,
                             cache_valids, stbs_done):
        comb = m.d.comb
        sync = m.d.sync

        bus = self.bus

        # Requests are all sent if stb is 0
        stbs_zero = Signal()
        comb += stbs_zero.eq(r.wb.stb == 0)
        comb += stbs_done.eq(stbs_zero)

        # If we are still sending requests, was one accepted?
        with m.If(~bus.stall & ~stbs_zero):
            # That was the last word? We are done sending.
            # Clear stb and set stbs_done so we can handle
            # an eventual last ack on the same cycle.
            with m.If(is_last_row_addr(r.req_adr, r.end_row_ix)):
                sync += Display("IS_LAST_ROW_ADDR r.wb.addr:%x "
                                "r.end_row_ix:%x r.wb.stb:%x stbs_zero:%x "
                                "stbs_done:%x", r.wb.adr, r.end_row_ix,
                                r.wb.stb, stbs_zero, stbs_done)
                sync += r.wb.stb.eq(0)
                comb += stbs_done.eq(1)

            # Calculate the next row address
            rarange = Signal(LINE_OFF_BITS - ROW_OFF_BITS)
            comb += rarange.eq(r.req_adr[ROW_OFF_BITS:LINE_OFF_BITS] + 1)
            sync += r.req_adr[ROW_OFF_BITS:LINE_OFF_BITS].eq(rarange)
            sync += Display("RARANGE r.req_adr:%x rarange:%x "
                            "stbs_zero:%x stbs_done:%x",
                            r.req_adr, rarange, stbs_zero, stbs_done)

        # Incoming acks processing
        with m.If(bus.ack):
            sync += Display("WB_IN_ACK data:%x stbs_zero:%x "
                            "stbs_done:%x",
                            bus.dat_r, stbs_zero, stbs_done)

            sync += r.rows_valid[r.store_row % ROW_PER_LINE].eq(1)

            # Check for completion
            with m.If(stbs_done & is_last_row(r.store_row, r.end_row_ix)):
                # Complete wishbone cycle
                sync += r.wb.cyc.eq(0)
                # be nice, clear addr
                sync += r.req_adr.eq(0)

                # Cache line is now valid
                cv = Signal(INDEX_BITS)
                comb += cv.eq(cache_valids[r.store_index])
                comb += cv.bit_select(replace_way, 1).eq(
                         r.store_valid & ~inval_in)
                sync += cache_valids[r.store_index].eq(cv)

                sync += r.state.eq(State.IDLE)

            # move on to next request in row
            # Increment store row counter
            sync += r.store_row.eq(next_row(r.store_row))

    # Cache miss/reload synchronous machine
    def icache_miss(self, m, r, req_is_miss,
                    req_index, req_laddr, req_tag, replace_way,
                    cache_valids, access_ok, real_addr):
        comb = m.d.comb
        sync = m.d.sync

        i_in, bus, m_in = self.i_in, self.bus, self.m_in
        stall_in, flush_in = self.stall_in, self.flush_in
        inval_in = self.inval_in

        stbs_done = Signal()

        comb += r.wb.sel.eq(-1)
        comb += r.wb.adr.eq(r.req_adr[3:])

        # Process cache invalidations
        with m.If(inval_in):
            for i in range(NUM_LINES):
                sync += cache_valids[i].eq(0)
            sync += r.store_valid.eq(0)

        # Main state machine
        with m.Switch(r.state):

            with m.Case(State.IDLE):
                self.icache_miss_idle(m, r, req_is_miss, req_laddr,
                                      req_index, req_tag, replace_way,
                                      real_addr)

            with m.Case(State.CLR_TAG, State.WAIT_ACK):
                with m.If(r.state == State.CLR_TAG):
                    self.icache_miss_clr_tag(m, r, replace_way,
                                             req_index,
                                             cache_valids)

                self.icache_miss_wait_ack(m, r, replace_way, inval_in,
                                          cache_valids, stbs_done)

        # TLB miss and protection fault processing
        with m.If(flush_in | m_in.tlbld):
            sync += r.fetch_failed.eq(0)
        with m.Elif(i_in.req & ~access_ok & ~stall_in):
            sync += r.fetch_failed.eq(1)

    # icache_log: if LOG_LENGTH > 0 generate
    def icache_log(self, m, req_hit_way, ra_valid, access_ok,
                   req_is_miss, req_is_hit, lway, wstate, r):
        comb = m.d.comb
        sync = m.d.sync

        bus, i_out = self.bus, self.i_out
        log_out, stall_out = self.log_out, self.stall_out

        # Output data to logger
        for i in range(LOG_LENGTH):
            log_data = Signal(54)
            lway     = Signal(WAY_BITS)
            wstate   = Signal()

            sync += lway.eq(req_hit_way)
            sync += wstate.eq(0)

            with m.If(r.state != State.IDLE):
                sync += wstate.eq(1)

            sync += log_data.eq(Cat(
                     ra_valid, access_ok, req_is_miss, req_is_hit,
                     lway, wstate, r.hit_nia[2:6], r.fetch_failed,
                     stall_out, bus.stall, r.wb.cyc, r.wb.stb,
                     r.real_addr[3:6], bus.ack, i_out.insn, i_out.valid
                     ))
            comb += log_out.eq(log_data)

    def elaborate(self, platform):

        m                = Module()
        comb             = m.d.comb

        # Storage. Hopefully "cache_rows" is a BRAM, the rest is LUTs
        cache_valids     = CacheValidsArray()

        # TLB Array
        itlb            = TLBArray()
        itlb_valid      = TLBValidArray()

        # TODO to be passed to nmigen as ram attributes
        # attribute ram_style of itlb_tags : signal is "distributed";
        # attribute ram_style of itlb_ptes : signal is "distributed";

        # Privilege bit from PTE EAA field
        eaa_priv         = Signal()

        r                = RegInternal()

        # Async signal on incoming request
        req_index        = Signal(INDEX_BITS)
        req_row          = Signal(ROW_BITS)
        req_hit_way      = Signal(WAY_BITS)
        req_tag          = Signal(TAG_BITS)
        req_is_hit       = Signal()
        req_is_miss      = Signal()
        req_laddr        = Signal(64)

        tlb_req_index    = Signal(TLB_BITS)
        real_addr        = Signal(REAL_ADDR_BITS)
        ra_valid         = Signal()
        priv_fault       = Signal()
        access_ok        = Signal()
        use_previous     = Signal()

        cache_out_row    = Signal(ROW_SIZE_BITS)

        plru_victim      = Signal(WAY_BITS)
        replace_way      = Signal(WAY_BITS)

        self.tlbmem = Memory(depth=TLB_SIZE, width=TLB_EA_TAG_BITS+TLB_PTE_BITS)
        self.tagmem = Memory(depth=NUM_LINES, width=TAG_RAM_WIDTH)

        # call sub-functions putting everything together,
        # using shared signals established above
        self.rams(m, r, cache_out_row, use_previous, replace_way, req_row)
        self.maybe_plrus(m, r, plru_victim)
        self.itlb_lookup(m, tlb_req_index, itlb, itlb_valid, real_addr,
                         ra_valid, eaa_priv, priv_fault,
                         access_ok)
        self.itlb_update(m, itlb, itlb_valid)
        self.icache_comb(m, use_previous, r, req_index, req_row, req_hit_way,
                         req_tag, real_addr, req_laddr,
                         cache_valids,
                         access_ok, req_is_hit, req_is_miss,
                         replace_way, plru_victim, cache_out_row)
        self.icache_hit(m, use_previous, r, req_is_hit, req_hit_way,
                        req_index, req_tag, real_addr)
        self.icache_miss(m, r, req_is_miss, req_index,
                         req_laddr, req_tag, replace_way,
                         cache_valids,
                         access_ok, real_addr)
        #self.icache_log(m, log_out, req_hit_way, ra_valid, access_ok,
        #                req_is_miss, req_is_hit, lway, wstate, r)

        # don't connect up to FetchUnitInterface so that some unit tests
        # can continue to operate
        if not self.use_fetch_iface:
            return m

        # connect to FetchUnitInterface. FetchUnitInterface is undocumented
        # so needs checking and iterative revising
        i_in, bus, i_out = self.i_in, self.bus, self.i_out
        comb += i_in.req.eq(self.a_i_valid)
        comb += i_in.nia.eq(self.a_pc_i)
        comb += self.stall_in.eq(self.a_stall_i)
        comb += self.f_fetch_err_o.eq(i_out.fetch_failed)
        comb += self.f_badaddr_o.eq(i_out.nia)
        comb += self.f_instr_o.eq(i_out.insn)
        comb += self.f_busy_o.eq(~i_out.valid) # probably

        # TODO, connect dcache wb_in/wb_out to "standard" nmigen Wishbone bus
        ibus = self.ibus
        comb += ibus.adr.eq(self.bus.adr)
        comb += ibus.dat_w.eq(self.bus.dat_w)
        comb += ibus.sel.eq(self.bus.sel)
        comb += ibus.cyc.eq(self.bus.cyc)
        comb += ibus.stb.eq(self.bus.stb)
        comb += ibus.we.eq(self.bus.we)

        comb += self.bus.dat_r.eq(ibus.dat_r)
        comb += self.bus.ack.eq(ibus.ack)
        if hasattr(ibus, "stall"):
            comb += self.bus.stall.eq(ibus.stall)
        else:
            # fake-up the wishbone stall signal to comply with pipeline mode
            # same thing is done in dcache.py
            comb += self.bus.stall.eq(self.bus.cyc & ~self.bus.ack)

        return m


def icache_sim(dut):
    i_in = dut.i_in
    i_out  = dut.i_out
    m_out = dut.m_in

    yield i_in.priv_mode.eq(1)
    yield i_in.req.eq(0)
    yield i_in.nia.eq(0)
    yield i_in.stop_mark.eq(0)
    yield m_out.tlbld.eq(0)
    yield m_out.tlbie.eq(0)
    yield m_out.addr.eq(0)
    yield m_out.pte.eq(0)
    yield
    yield
    yield
    yield

    # miss, stalls for a bit
    yield i_in.req.eq(1)
    yield i_in.nia.eq(Const(0x0000000000000004, 64))
    yield
    valid = yield i_out.valid
    while not valid:
        yield
        valid = yield i_out.valid
    yield i_in.req.eq(0)

    insn  = yield i_out.insn
    nia   = yield i_out.nia
    assert insn == 0x00000001, \
        "insn @%x=%x expected 00000001" % (nia, insn)
    yield i_in.req.eq(0)
    yield

    # hit
    yield i_in.req.eq(1)
    yield i_in.nia.eq(Const(0x0000000000000008, 64))
    yield
    valid = yield i_out.valid
    while not valid:
        yield
        valid = yield i_out.valid
    yield i_in.req.eq(0)

    nia   = yield i_out.nia
    insn  = yield i_out.insn
    yield
    assert insn == 0x00000002, \
        "insn @%x=%x expected 00000002" % (nia, insn)

    # another miss
    yield i_in.req.eq(1)
    yield i_in.nia.eq(Const(0x0000000000000040, 64))
    yield
    valid = yield i_out.valid
    while not valid:
        yield
        valid = yield i_out.valid
    yield i_in.req.eq(0)

    nia   = yield i_in.nia
    insn  = yield i_out.insn
    assert insn == 0x00000010, \
        "insn @%x=%x expected 00000010" % (nia, insn)

    # test something that aliases (this only works because
    # the unit test SRAM is a depth of 512)
    yield i_in.req.eq(1)
    yield i_in.nia.eq(Const(0x0000000000000100, 64))
    yield
    yield
    valid = yield i_out.valid
    assert not valid
    for i in range(30):
        yield
    yield
    insn  = yield i_out.insn
    valid = yield i_out.valid
    insn  = yield i_out.insn
    assert valid
    assert insn == 0x00000040, \
        "insn @%x=%x expected 00000040" % (nia, insn)
    yield i_in.req.eq(0)


def test_icache(mem):
    from soc.config.test.test_loadstore import TestMemPspec
    pspec = TestMemPspec(addr_wid=32,
                         mask_wid=8,
                         reg_wid=64,
                         )
    dut = ICache(pspec)

    memory = Memory(width=64, depth=512, init=mem)
    sram = SRAM(memory=memory, granularity=8)

    m = Module()

    m.submodules.icache = dut
    m.submodules.sram = sram

    m.d.comb += sram.bus.cyc.eq(dut.bus.cyc)
    m.d.comb += sram.bus.stb.eq(dut.bus.stb)
    m.d.comb += sram.bus.we.eq(dut.bus.we)
    m.d.comb += sram.bus.sel.eq(dut.bus.sel)
    m.d.comb += sram.bus.adr.eq(dut.bus.adr)
    m.d.comb += sram.bus.dat_w.eq(dut.bus.dat_w)

    m.d.comb += dut.bus.ack.eq(sram.bus.ack)
    m.d.comb += dut.bus.dat_r.eq(sram.bus.dat_r)

    # nmigen Simulation
    sim = Simulator(m)
    sim.add_clock(1e-6)

    sim.add_sync_process(wrap(icache_sim(dut)))
    with sim.write_vcd('test_icache.vcd'):
        sim.run()


if __name__ == '__main__':
    from soc.config.test.test_loadstore import TestMemPspec
    pspec = TestMemPspec(addr_wid=64,
                         mask_wid=8,
                         reg_wid=64,
                         )
    dut = ICache(pspec)
    vl = rtlil.convert(dut, ports=[])
    with open("test_icache.il", "w") as f:
        f.write(vl)

    # set up memory every 32-bits with incrementing values 0 1 2 ...
    mem = []
    for i in range(512):
        mem.append((i*2) | ((i*2+1)<<32))

    test_icache(mem)