1 """ICache
2
3 based on Anton Blanchard microwatt icache.vhdl
4
5 Set associative icache
6
7 TODO (in no specific order):
8 * Add debug interface to inspect cache content
9 * Add snoop/invalidate path
10 * Add multi-hit error detection
11 * Pipelined bus interface (wb or axi)
12 * Maybe add parity? There's a few bits free in each BRAM row on Xilinx
13 * Add optimization: service hits on partially loaded lines
14 * Add optimization: (maybe) interrupt reload on flush/redirect
15 * Check if playing with the geometry of the cache tags allows for more
16 efficient use of distributed RAM and less logic/muxes. Currently we
17 write TAG_BITS width which may not match full ram blocks and might
18 cause muxes to be inferred for "partial writes".
19 * Check if making the read size of PLRU a ROM helps utilization
20
21 Links:
22
23 * https://bugs.libre-soc.org/show_bug.cgi?id=485
24 * https://libre-soc.org/irclog-microwatt/%23microwatt.2021-12-07.log.html
25 (discussion about brams for ECP5)
26
27 """
28
29 from enum import (Enum, unique)
30 from nmigen import (Module, Signal, Elaboratable, Cat, Array, Const, Repl,
31 Record)
32 from nmigen.cli import main, rtlil
33 from nmutil.iocontrol import RecordObject
34 from nmigen.utils import log2_int
35 from nmigen.lib.coding import Decoder
36 from nmutil.util import Display
37
38 #from nmutil.plru import PLRU
39 from soc.experiment.plru import PLRU, PLRUs
40 from soc.experiment.cache_ram import CacheRam
41
42 from soc.experiment.mem_types import (Fetch1ToICacheType,
43 ICacheToDecode1Type,
44 MMUToICacheType)
45
46 from soc.experiment.wb_types import (WB_ADDR_BITS, WB_DATA_BITS,
47 WB_SEL_BITS, WBAddrType, WBDataType,
48 WBSelType, WBMasterOut, WBSlaveOut,
49 )
50
51 from nmigen_soc.wishbone.bus import Interface
52 from soc.minerva.units.fetch import FetchUnitInterface
53
54
55 # for test
56 from soc.bus.sram import SRAM
57 from nmigen import Memory
58 from nmutil.util import wrap
59 from nmigen.cli import main, rtlil
60
61 # NOTE: to use cxxsim, export NMIGEN_SIM_MODE=cxxsim from the shell
62 # Also, check out the cxxsim nmigen branch, and latest yosys from git
63 from nmutil.sim_tmp_alternative import Simulator, Settle
64
65
66 SIM = 0
67 LINE_SIZE = 64
68 # BRAM organisation: We never access more than wishbone_data_bits
69 # at a time so to save resources we make the array only that wide,
70 # and use consecutive indices to make a cache "line"
71 #
72 # ROW_SIZE is the width in bytes of the BRAM (based on WB, so 64-bits)
73 ROW_SIZE = WB_DATA_BITS // 8
74 # Number of lines in a set
75 NUM_LINES = 64
76 # Number of ways
77 NUM_WAYS = 2
78 # L1 ITLB number of entries (direct mapped)
79 TLB_SIZE = 64
80 # L1 ITLB log_2(page_size)
81 TLB_LG_PGSZ = 12
82 # Number of real address bits that we store
83 REAL_ADDR_BITS = 56
84 # Non-zero to enable log data collection
85 LOG_LENGTH = 0
86
87 ROW_SIZE_BITS = ROW_SIZE * 8
88 # ROW_PER_LINE is the number of row (wishbone) transactions in a line
89 ROW_PER_LINE = LINE_SIZE // ROW_SIZE
90 # BRAM_ROWS is the number of rows in BRAM needed to represent the full icache
91 BRAM_ROWS = NUM_LINES * ROW_PER_LINE
92 # INSN_PER_ROW is the number of 32bit instructions per BRAM row
93 INSN_PER_ROW = ROW_SIZE_BITS // 32
94
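# Worked example of the derived sizes, assuming WB_DATA_BITS = 64 (as the
# comment above notes) and the default settings:
#   ROW_SIZE     = 64 // 8  = 8 bytes   (ROW_SIZE_BITS = 64)
#   ROW_PER_LINE = 64 // 8  = 8 wishbone transfers per cache line
#   BRAM_ROWS    = 64 * 8   = 512 BRAM rows per way
#   INSN_PER_ROW = 64 // 32 = 2 instructions per row
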
95 # Bit field counts in the address
96 #
97 # INSN_BITS is the number of bits to select an instruction in a row
98 INSN_BITS = log2_int(INSN_PER_ROW)
99 # ROW_BITS is the number of bits to select a row
100 ROW_BITS = log2_int(BRAM_ROWS)
101 # ROW_LINE_BITS is the number of bits to select a row within a line
102 ROW_LINE_BITS = log2_int(ROW_PER_LINE)
103 # LINE_OFF_BITS is the number of bits for the offset in a cache line
104 LINE_OFF_BITS = log2_int(LINE_SIZE)
105 # ROW_OFF_BITS is the number of bits for the offset in a row
106 ROW_OFF_BITS = log2_int(ROW_SIZE)
107 # INDEX_BITS is the number of bits to select a cache line
108 INDEX_BITS = log2_int(NUM_LINES)
109 # SET_SIZE_BITS is the log base 2 of the set size
110 SET_SIZE_BITS = LINE_OFF_BITS + INDEX_BITS
111 # TAG_BITS is the number of bits of the tag part of the address
112 TAG_BITS = REAL_ADDR_BITS - SET_SIZE_BITS
113 # TAG_WIDTH is the width in bits of each way of the tag RAM
114 TAG_WIDTH = TAG_BITS + 7 - ((TAG_BITS + 7) % 8)
115
116 # WAY_BITS is the number of bits to select a way
117 WAY_BITS = log2_int(NUM_WAYS)
118 TAG_RAM_WIDTH = TAG_BITS * NUM_WAYS
119
120 # L1 ITLB
121 TLB_BITS = log2_int(TLB_SIZE)
122 TLB_EA_TAG_BITS = 64 - (TLB_LG_PGSZ + TLB_BITS)
123 TLB_PTE_BITS = 64
124
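# Worked example of the widths above, assuming the default geometry
# (NUM_LINES=64, LINE_SIZE=64, NUM_WAYS=2, REAL_ADDR_BITS=56):
#   SET_SIZE_BITS   = 6 + 6         = 12
#   TAG_BITS        = 56 - 12       = 44
#   TAG_WIDTH       = 44 rounded up to a multiple of 8 = 48
#   TAG_RAM_WIDTH   = 44 * 2        = 88 (one row holds both ways' tags)
#   TLB_EA_TAG_BITS = 64 - (12 + 6) = 46
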
125 print("BRAM_ROWS =", BRAM_ROWS)
126 print("INDEX_BITS =", INDEX_BITS)
127 print("INSN_BITS =", INSN_BITS)
128 print("INSN_PER_ROW =", INSN_PER_ROW)
129 print("LINE_SIZE =", LINE_SIZE)
130 print("LINE_OFF_BITS =", LINE_OFF_BITS)
131 print("LOG_LENGTH =", LOG_LENGTH)
132 print("NUM_LINES =", NUM_LINES)
133 print("NUM_WAYS =", NUM_WAYS)
134 print("REAL_ADDR_BITS =", REAL_ADDR_BITS)
135 print("ROW_BITS =", ROW_BITS)
136 print("ROW_OFF_BITS =", ROW_OFF_BITS)
137 print("ROW_LINE_BITS =", ROW_LINE_BITS)
138 print("ROW_PER_LINE =", ROW_PER_LINE)
139 print("ROW_SIZE =", ROW_SIZE)
140 print("ROW_SIZE_BITS =", ROW_SIZE_BITS)
141 print("SET_SIZE_BITS =", SET_SIZE_BITS)
142 print("SIM =", SIM)
143 print("TAG_BITS =", TAG_BITS)
144 print("TAG_RAM_WIDTH =", TAG_RAM_WIDTH)
145 print("TAG_WIDTH =", TAG_WIDTH)
146 print("TLB_BITS =", TLB_BITS)
147 print("TLB_EA_TAG_BITS =", TLB_EA_TAG_BITS)
148 print("TLB_LG_PGSZ =", TLB_LG_PGSZ)
149 print("TLB_PTE_BITS =", TLB_PTE_BITS)
150 print("TLB_SIZE =", TLB_SIZE)
151 print("WAY_BITS =", WAY_BITS)
152
153 # from microwatt/utils.vhdl
154 def ispow2(n):
155 return n != 0 and (n & (n - 1)) == 0
156
157 assert LINE_SIZE % ROW_SIZE == 0
158 assert ispow2(LINE_SIZE), "LINE_SIZE not power of 2"
159 assert ispow2(NUM_LINES), "NUM_LINES not power of 2"
160 assert ispow2(ROW_PER_LINE), "ROW_PER_LINE not power of 2"
161 assert ispow2(INSN_PER_ROW), "INSN_PER_ROW not power of 2"
162 assert (ROW_BITS == (INDEX_BITS + ROW_LINE_BITS)), \
163 "geometry bits don't add up"
164 assert (LINE_OFF_BITS == (ROW_OFF_BITS + ROW_LINE_BITS)), \
165 "geometry bits don't add up"
166 assert (REAL_ADDR_BITS == (TAG_BITS + INDEX_BITS + LINE_OFF_BITS)), \
167 "geometry bits don't add up"
168 assert (REAL_ADDR_BITS == (TAG_BITS + ROW_BITS + ROW_OFF_BITS)), \
169 "geometry bits don't add up"
170
171 # Example of layout for 32 lines of 64 bytes:
172 #
173 # .. tag |index| line |
174 # .. | row | |
175 # .. | | | |00| zero (2)
176 # .. | | |-| | INSN_BITS (1)
177 # .. | |---| | ROW_LINE_BITS (3)
178 # .. | |--- - --| LINE_OFF_BITS (6)
179 # .. | |- --| ROW_OFF_BITS (3)
180 # .. |----- ---| | ROW_BITS (8)
181 # .. |-----| | INDEX_BITS (5)
182 # .. --------| | TAG_BITS (53)
183
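# For the geometry actually configured above (64 lines of 64 bytes,
# 56-bit real address) the same split works out as:
#   instruction select: 2 zero bits + INSN_BITS (1) = ROW_OFF_BITS (3)
#   row within line:    ROW_LINE_BITS (3), so LINE_OFF_BITS = 6
#   line select:        INDEX_BITS (6),    so SET_SIZE_BITS = 12
#   row select:         ROW_BITS (9) = INDEX_BITS + ROW_LINE_BITS
#   tag:                TAG_BITS (44) = 56 - SET_SIZE_BITS
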
184 # The cache data BRAM, organized as described above, for each way
185 #subtype cache_row_t is std_ulogic_vector(ROW_SIZE_BITS-1 downto 0);
186 #
187 # The cache tags LUTRAM has a row per set. Vivado is a pain and will
188 # not handle a clean (commented) definition of the cache tags as a 3d
189 # memory. For now, work around it by putting all the tags for a set in one row.
190 def CacheValidsArray():
191 return Array(Signal(NUM_WAYS, name="tag_valids%d" % x) \
192 for x in range(NUM_LINES))
193
194 def RowPerLineValidArray():
195 return Array(Signal(name="rows_valid_%d" %x) \
196 for x in range(ROW_PER_LINE))
197
198
199 # TODO to be passed to nmigen as ram attributes
200 # attribute ram_style : string;
201 # attribute ram_style of cache_tags : signal is "distributed";
202
203 def TLBValidArray():
204 return Array(Signal(name="tlb_valid%d" % x)
205 for x in range(TLB_SIZE))
206
207 def TLBRecord(name):
208 tlb_layout = [ ('tag', TLB_EA_TAG_BITS),
209 ('pte', TLB_PTE_BITS)
210 ]
211 return Record(tlb_layout, name=name)
212
213 def TLBArray():
214 return Array(TLBRecord("tlb%d" % x) for x in range(TLB_SIZE))
215
216 # PLRU output interface
217 def PLRUOut():
218 return Array(Signal(WAY_BITS, name="plru_out_%d" %x) \
219 for x in range(NUM_LINES))
220
221 # Return the cache line index (tag index) for an address
222 def get_index(addr):
223 return addr[LINE_OFF_BITS:SET_SIZE_BITS]
224
225 # Return the cache row index (data memory) for an address
226 def get_row(addr):
227 return addr[ROW_OFF_BITS:SET_SIZE_BITS]
228
229 # Return the index of a row within a line
230 def get_row_of_line(row):
231 return row[:ROW_BITS][:ROW_LINE_BITS]
232
233 # Returns whether the address is in the last row of a line
234 def is_last_row_addr(addr, last):
235 return addr[ROW_OFF_BITS:LINE_OFF_BITS] == last
236
237 # Returns whether the given row is the last row of a line
238 def is_last_row(row, last):
239 return get_row_of_line(row) == last
240
241 # Return the next row in the current cache line. We use a dedicated
242 # function in order to limit the size of the generated adder to be
243 # only the bits within a cache line (3 bits with default settings)
244 def next_row(row):
245 row_v = row[0:ROW_LINE_BITS] + 1
246 return Cat(row_v[:ROW_LINE_BITS], row[ROW_LINE_BITS:])
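# For example, with ROW_LINE_BITS = 3 a row index of 0b110_111 (last row
# of line 6) wraps to 0b110_000: only a 3-bit adder is generated and the
# upper (line-select) bits are passed through unchanged.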
247
248 # Read the instruction word for the given address
249 # in the current cache row
250 def read_insn_word(addr, data):
251 word = addr[2:INSN_BITS+2]
252 return data.word_select(word, 32)
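# e.g. with INSN_PER_ROW = 2 only address bit 2 is used: addresses of the
# form ...0xx return row bits [0:32], ...1xx return row bits [32:64]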
253
254 # Get the tag value from the address
255 def get_tag(addr):
256 return addr[SET_SIZE_BITS:REAL_ADDR_BITS]
257
258 # Read a tag from a tag memory row
259 def read_tag(way, tagset):
260 return tagset.word_select(way, TAG_BITS)
261
262 # Write a tag to tag memory row
263 def write_tag(way, tagset, tag):
264 return read_tag(way, tagset).eq(tag)
265
266 # Simple hash for direct-mapped TLB index
267 def hash_ea(addr):
268 hsh = (addr[TLB_LG_PGSZ:TLB_LG_PGSZ + TLB_BITS] ^
269 addr[TLB_LG_PGSZ + TLB_BITS:TLB_LG_PGSZ + 2 * TLB_BITS ] ^
270 addr[TLB_LG_PGSZ + 2 * TLB_BITS:TLB_LG_PGSZ + 3 * TLB_BITS])
271 return hsh
272
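# Software sketch of the slicing helpers above, on plain integers rather
# than nmigen expressions (illustrative only: these _sw_* names are not
# part of the original design and are not used by the hardware below)
def _sw_get_index(addr):
    return (addr >> LINE_OFF_BITS) & (NUM_LINES - 1)

def _sw_get_row(addr):
    return (addr >> ROW_OFF_BITS) & (BRAM_ROWS - 1)

def _sw_get_tag(addr):
    return (addr >> SET_SIZE_BITS) & ((1 << TAG_BITS) - 1)

def _sw_hash_ea(addr):
    # XOR-fold three TLB_BITS-wide fields above the page offset, as
    # hash_ea() does for the direct-mapped iTLB index
    fld = lambda n: (addr >> (TLB_LG_PGSZ + n * TLB_BITS)) & (TLB_SIZE - 1)
    return fld(0) ^ fld(1) ^ fld(2)
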
273
274 # Cache reload state machine
275 @unique
276 class State(Enum):
277 IDLE = 0
278 CLR_TAG = 1
279 WAIT_ACK = 2
280
281
282 class RegInternal(RecordObject):
283 def __init__(self):
284 super().__init__()
285 # Cache hit state (Latches for 1 cycle BRAM access)
286 self.hit_way = Signal(WAY_BITS)
287 self.hit_nia = Signal(64)
288 self.hit_smark = Signal()
289 self.hit_valid = Signal()
290
291 # Cache miss state (reload state machine)
292 self.state = Signal(State, reset=State.IDLE)
293 self.wb = WBMasterOut("wb")
294 self.req_adr = Signal(64)
295 self.store_way = Signal(WAY_BITS)
296 self.store_index = Signal(INDEX_BITS)
297 self.store_row = Signal(ROW_BITS)
298 self.store_tag = Signal(TAG_BITS)
299 self.store_valid = Signal()
300 self.end_row_ix = Signal(ROW_LINE_BITS)
301 self.rows_valid = RowPerLineValidArray()
302
303 # TLB miss state
304 self.fetch_failed = Signal()
305
306
307 class ICache(FetchUnitInterface, Elaboratable):
308 """64 bit set associative icache. All instructions are 4B aligned."""
309 def __init__(self, pspec):
310 FetchUnitInterface.__init__(self, pspec)
311 self.i_in = Fetch1ToICacheType(name="i_in")
312 self.i_out = ICacheToDecode1Type(name="i_out")
313
314 self.m_in = MMUToICacheType(name="m_in")
315
316 self.stall_in = Signal()
317 self.stall_out = Signal()
318 self.flush_in = Signal()
319 self.inval_in = Signal()
320
321 # standard naming (wired to non-standard for compatibility)
322 self.bus = Interface(addr_width=32,
323 data_width=64,
324 granularity=8,
325 features={'stall'},
326 alignment=0,
327 name="icache_wb")
328
329 self.log_out = Signal(54)
330
331 # use FetchUnitInterface, helps keep some unit tests running
332 self.use_fetch_iface = False
333
334 def use_fetch_interface(self):
335 self.use_fetch_iface = True
336
337 # Generate a cache RAM for each way
338 def rams(self, m, r, cache_out_row, use_previous,
339 replace_way, req_row):
340
341 comb = m.d.comb
342 sync = m.d.sync
343
344 bus, stall_in = self.bus, self.stall_in
345
346 # read condition (for every cache ram)
347 do_read = Signal()
348 comb += do_read.eq(~(stall_in | use_previous))
349
350 rd_addr = Signal(ROW_BITS)
351 wr_addr = Signal(ROW_BITS)
352 comb += rd_addr.eq(req_row)
353 comb += wr_addr.eq(r.store_row)
354
355 # binary-to-unary converters: replace-way enabled by bus.ack,
356 # hit-way left permanently enabled
357 m.submodules.replace_way_e = re = Decoder(NUM_WAYS)
358 m.submodules.hit_way_e = he = Decoder(NUM_WAYS)
359 comb += re.i.eq(replace_way)
360 comb += re.n.eq(~bus.ack)
361 comb += he.i.eq(r.hit_way)
362
363 for i in range(NUM_WAYS):
364 do_write = Signal(name="do_wr_%d" % i)
365 d_out = Signal(ROW_SIZE_BITS, name="d_out_%d" % i)
366 wr_sel = Signal(ROW_SIZE, name="wr_sel_%d" % i)
367
368 way = CacheRam(ROW_BITS, ROW_SIZE_BITS, TRACE=True, ram_num=i)
369 m.submodules["cacheram_%d" % i] = way
370
371 comb += way.rd_en.eq(do_read)
372 comb += way.rd_addr.eq(rd_addr)
373 comb += d_out.eq(way.rd_data_o)
374 comb += way.wr_sel.eq(wr_sel)
375 comb += way.wr_addr.eq(wr_addr)
376 comb += way.wr_data.eq(bus.dat_r)
377
378 comb += do_write.eq(re.o[i])
379
380 with m.If(do_write):
381 sync += Display("cache write adr: %x data: %lx",
382 wr_addr, way.wr_data)
383
384 with m.If(he.o[i]):
385 comb += cache_out_row.eq(d_out)
386 with m.If(do_read):
387 sync += Display("cache read adr: %x data: %x",
388 req_row, d_out)
389
390 comb += wr_sel.eq(Repl(do_write, ROW_SIZE))
391
392 # Generate PLRUs
393 def maybe_plrus(self, m, r, plru_victim):
394 comb = m.d.comb
395
396 if NUM_WAYS == 0:
397 return
398
399
400 m.submodules.plrus = plru = PLRUs(NUM_LINES, WAY_BITS)
401 comb += plru.way.eq(r.hit_way)
402 comb += plru.valid.eq(r.hit_valid)
403 comb += plru.index.eq(get_index(r.hit_nia))
404 comb += plru.isel.eq(r.store_index) # select victim
405 comb += plru_victim.eq(plru.o_index) # selected victim
406
407 # TLB hit detection and real address generation
408 def itlb_lookup(self, m, tlb_req_index, itlb, itlb_valid,
409 real_addr, ra_valid, eaa_priv,
410 priv_fault, access_ok):
411
412 comb = m.d.comb
413
414 i_in = self.i_in
415
416 # use an *asynchronous* Memory read port here (combinatorial)
417 m.submodules.rd_tlb = rd_tlb = self.tlbmem.read_port(domain="comb")
418 tlb = TLBRecord("tlb_rdport")
419 pte, ttag = tlb.pte, tlb.tag
420
421 comb += tlb_req_index.eq(hash_ea(i_in.nia))
422 comb += rd_tlb.addr.eq(tlb_req_index)
423 comb += tlb.eq(rd_tlb.data)
424
425 with m.If(i_in.virt_mode):
426 comb += real_addr.eq(Cat(i_in.nia[:TLB_LG_PGSZ],
427 pte[TLB_LG_PGSZ:REAL_ADDR_BITS]))
428
429 with m.If(ttag == i_in.nia[TLB_LG_PGSZ + TLB_BITS:64]):
430 comb += ra_valid.eq(itlb_valid[tlb_req_index])
431
432 comb += eaa_priv.eq(pte[3])
433
434 with m.Else():
435 comb += real_addr.eq(i_in.nia[:REAL_ADDR_BITS])
436 comb += ra_valid.eq(1)
437 comb += eaa_priv.eq(1)
438
439 # No IAMR, so no KUEP support for now
440 comb += priv_fault.eq(eaa_priv & ~i_in.priv_mode)
441 comb += access_ok.eq(ra_valid & ~priv_fault)
442
443 # iTLB update
444 def itlb_update(self, m, itlb, itlb_valid):
445 comb = m.d.comb
446 sync = m.d.sync
447
448 m_in = self.m_in
449
450 wr_index = Signal(TLB_BITS)
451 comb += wr_index.eq(hash_ea(m_in.addr))
452
453 m.submodules.wr_tlb = wr_tlb = self.tlbmem.write_port()
454
455 with m.If(m_in.tlbie & m_in.doall):
456 # Clear all valid bits
457 for i in range(TLB_SIZE):
458 sync += itlb_valid[i].eq(0)
459
460 with m.Elif(m_in.tlbie):
461 # Clear entry regardless of hit or miss
462 sync += itlb_valid[wr_index].eq(0)
463
464 with m.Elif(m_in.tlbld):
465 tlb = TLBRecord("tlb_wrport")
466 comb += tlb.tag.eq(m_in.addr[TLB_LG_PGSZ + TLB_BITS:64])
467 comb += tlb.pte.eq(m_in.pte)
468 comb += wr_tlb.en.eq(1)
469 comb += wr_tlb.addr.eq(wr_index)
470 comb += wr_tlb.data.eq(tlb)
471 sync += itlb_valid[wr_index].eq(1)
472
473 # Cache hit detection, output to fetch2 and other misc logic
474 def icache_comb(self, m, use_previous, r, req_index, req_row,
475 req_hit_way, req_tag, real_addr, req_laddr,
476 cache_valids, access_ok,
477 req_is_hit, req_is_miss, replace_way,
478 plru_victim, cache_out_row):
479
480 comb = m.d.comb
481 m.submodules.rd_tag = rd_tag = self.tagmem.read_port(domain="comb")
482
483 i_in, i_out, bus = self.i_in, self.i_out, self.bus
484 flush_in, stall_out = self.flush_in, self.stall_out
485
486 is_hit = Signal()
487 hit_way = Signal(WAY_BITS)
488
489 # i_in.sequential means that i_in.nia this cycle is 4 more than
490 # last cycle. If we read more than 32 bits at a time, had a
491 # cache hit last cycle, and we don't want the first 32-bit chunk
492 # then we can keep the data we read last cycle and just use that.
493 with m.If(i_in.nia[2:INSN_BITS+2] != 0):
494 comb += use_previous.eq(i_in.sequential & r.hit_valid)
495
496 # Extract line, row and tag from request
497 comb += req_index.eq(get_index(i_in.nia))
498 comb += req_row.eq(get_row(i_in.nia))
499 comb += req_tag.eq(get_tag(real_addr))
500
501 # Calculate address of beginning of cache row, will be
502 # used for cache miss processing if needed
503 comb += req_laddr.eq(Cat(
504 Const(0, ROW_OFF_BITS),
505 real_addr[ROW_OFF_BITS:REAL_ADDR_BITS],
506 ))
507
508 # Test if pending request is a hit on any way
509 hitcond = Signal()
510 comb += hitcond.eq((r.state == State.WAIT_ACK)
511 & (req_index == r.store_index)
512 & r.rows_valid[req_row % ROW_PER_LINE]
513 )
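        # hitcond lets the line currently being refilled count as present:
        # a row already fetched (marked in r.rows_valid by the WAIT_ACK
        # machine) can be hit without waiting for the whole line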
514 # the store-way Decoder output is forced to zero unless i_in.req is set
515 cvb = Signal(NUM_WAYS)
516 ctag = Signal(TAG_RAM_WIDTH)
517 comb += rd_tag.addr.eq(req_index)
518 comb += ctag.eq(rd_tag.data)
519 comb += cvb.eq(cache_valids[req_index])
520 m.submodules.store_way_e = se = Decoder(NUM_WAYS)
521 comb += se.i.eq(r.store_way)
522 comb += se.n.eq(~i_in.req)
523 for i in range(NUM_WAYS):
524 tagi = Signal(TAG_BITS, name="tag_i%d" % i)
525 hit_test = Signal(name="hit_test%d" % i)
526 is_tag_hit = Signal(name="is_tag_hit_%d" % i)
527 comb += tagi.eq(read_tag(i, ctag))
528 comb += hit_test.eq(se.o[i])
529 comb += is_tag_hit.eq((cvb[i] | (hitcond & hit_test)) &
530 (tagi == req_tag))
531 with m.If(is_tag_hit):
532 comb += hit_way.eq(i)
533 comb += is_hit.eq(1)
534
535 # Generate the "hit" and "miss" signals
536 # for the synchronous blocks
537 with m.If(i_in.req & access_ok & ~flush_in):
538 comb += req_is_hit.eq(is_hit)
539 comb += req_is_miss.eq(~is_hit)
540
541 comb += req_hit_way.eq(hit_way)
542
543 # The way to replace on a miss
544 with m.If(r.state == State.CLR_TAG):
545 comb += replace_way.eq(plru_victim)
546 with m.Else():
547 comb += replace_way.eq(r.store_way)
548
549 # Output instruction from current cache row
550 #
551 # Note: This is a mild violation of our design principle of
552 # having pipeline stages output from a clean latch. In this
553 # case we output the result of a mux. The alternative would
554 # be to output an entire row, which I prefer not to do just yet
555 # as it would force fetch2 to know about some of the cache
556 # geometry information.
557 comb += i_out.insn.eq(read_insn_word(r.hit_nia, cache_out_row))
558 comb += i_out.valid.eq(r.hit_valid)
559 comb += i_out.nia.eq(r.hit_nia)
560 comb += i_out.stop_mark.eq(r.hit_smark)
561 comb += i_out.fetch_failed.eq(r.fetch_failed)
562
563 # Stall fetch1 if we have a miss on cache or TLB
564 # or a protection fault
565 comb += stall_out.eq(~(is_hit & access_ok))
566
567 # Wishbone requests output (from the cache miss reload machine)
568 comb += bus.we.eq(r.wb.we)
569 comb += bus.adr.eq(r.wb.adr)
570 comb += bus.sel.eq(r.wb.sel)
571 comb += bus.stb.eq(r.wb.stb)
572 comb += bus.dat_w.eq(r.wb.dat)
573 comb += bus.cyc.eq(r.wb.cyc)
574
575 # Cache hit synchronous machine
576 def icache_hit(self, m, use_previous, r, req_is_hit, req_hit_way,
577 req_index, req_tag, real_addr):
578 sync = m.d.sync
579
580 i_in, stall_in = self.i_in, self.stall_in
581 flush_in = self.flush_in
582
583 # keep outputs to fetch2 unchanged on a stall
584 # except that flush or reset sets valid to 0
585 # If use_previous, keep the same data as last
586 # cycle and use the second half
587 with m.If(stall_in | use_previous):
588 with m.If(flush_in):
589 sync += r.hit_valid.eq(0)
590 with m.Else():
591 # On a hit, latch the request for the next cycle,
592 # when the BRAM data will be available on the
593 # cache_out output of the corresponding way
594 sync += r.hit_valid.eq(req_is_hit)
595
596 with m.If(req_is_hit):
597 sync += r.hit_way.eq(req_hit_way)
598 sync += Display("cache hit nia:%x IR:%x SM:%x idx:%x tag:%x "
599 "way:%x RA:%x", i_in.nia, i_in.virt_mode,
600 i_in.stop_mark, req_index, req_tag,
601 req_hit_way, real_addr)
602
603 with m.If(~stall_in):
604 # Send stop marks and NIA down regardless of validity
605 sync += r.hit_smark.eq(i_in.stop_mark)
606 sync += r.hit_nia.eq(i_in.nia)
607
608 def icache_miss_idle(self, m, r, req_is_miss, req_laddr,
609 req_index, req_tag, replace_way, real_addr):
610 comb = m.d.comb
611 sync = m.d.sync
612
613 i_in = self.i_in
614
615 # Reset per-row valid flags, only used in WAIT_ACK
616 for i in range(ROW_PER_LINE):
617 sync += r.rows_valid[i].eq(0)
618
619 # We need to read a cache line
620 with m.If(req_is_miss):
621 sync += Display(
622 "cache miss nia:%x IR:%x SM:%x idx:%x "
623 " way:%x tag:%x RA:%x", i_in.nia,
624 i_in.virt_mode, i_in.stop_mark, req_index,
625 replace_way, req_tag, real_addr)
626
627 # Keep track of our index and way for subsequent stores
628 st_row = Signal(ROW_BITS)
629 comb += st_row.eq(get_row(req_laddr))
630 sync += r.store_index.eq(req_index)
631 sync += r.store_row.eq(st_row)
632 sync += r.store_tag.eq(req_tag)
633 sync += r.store_valid.eq(1)
634 sync += r.end_row_ix.eq(get_row_of_line(st_row) - 1)
635
636 # Prep for first wishbone read. We calculate the address
637 # of the start of the cache line and start the WB cycle.
638 sync += r.req_adr.eq(req_laddr)
639 sync += r.wb.cyc.eq(1)
640 sync += r.wb.stb.eq(1)
641
642 # Track that we had one request sent
643 sync += r.state.eq(State.CLR_TAG)
644
645 def icache_miss_clr_tag(self, m, r, replace_way,
646 req_index,
647 cache_valids):
648 comb = m.d.comb
649 sync = m.d.sync
650 m.submodules.wr_tag = wr_tag = self.tagmem.write_port(
651 granularity=TAG_BITS)
652
653 # Get victim way from plru
654 sync += r.store_way.eq(replace_way)
655
656 # Force misses on that way while reloading that line
657 cv = Signal(NUM_WAYS)
658 comb += cv.eq(cache_valids[req_index])
659 comb += cv.bit_select(replace_way, 1).eq(0)
660 sync += cache_valids[req_index].eq(cv)
661
662 # use write-port "granularity" to select the tag to write to
663 # TODO: the Memory should be multiplied up (by NUM_TAGS)
664 tagset = Signal(TAG_RAM_WIDTH)
665 comb += tagset.eq(r.store_tag << (replace_way*TAG_BITS))
666 comb += wr_tag.en.eq(1<<replace_way)
667 comb += wr_tag.addr.eq(r.store_index)
668 comb += wr_tag.data.eq(tagset)
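        # e.g. with TAG_BITS=44 and NUM_WAYS=2 the tag row is 88 bits wide
        # and holds two 44-bit granules; en = 1<<replace_way updates only
        # the victim way's tag, leaving the other way's tag intact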
669
670 sync += r.state.eq(State.WAIT_ACK)
671
672 def icache_miss_wait_ack(self, m, r, replace_way, inval_in,
673 cache_valids, stbs_done):
674 comb = m.d.comb
675 sync = m.d.sync
676
677 bus = self.bus
678
679 # Requests are all sent if stb is 0
680 stbs_zero = Signal()
681 comb += stbs_zero.eq(r.wb.stb == 0)
682 comb += stbs_done.eq(stbs_zero)
683
684 # If we are still sending requests, was one accepted?
685 with m.If(~bus.stall & ~stbs_zero):
686 # That was the last word? We are done sending.
687 # Clear stb and set stbs_done so we can handle
688 # an eventual last ack on the same cycle.
689 with m.If(is_last_row_addr(r.req_adr, r.end_row_ix)):
690 sync += Display("IS_LAST_ROW_ADDR r.wb.addr:%x "
691 "r.end_row_ix:%x r.wb.stb:%x stbs_zero:%x "
692 "stbs_done:%x", r.wb.adr, r.end_row_ix,
693 r.wb.stb, stbs_zero, stbs_done)
694 sync += r.wb.stb.eq(0)
695 comb += stbs_done.eq(1)
696
697 # Calculate the next row address
698 rarange = Signal(LINE_OFF_BITS - ROW_OFF_BITS)
699 comb += rarange.eq(r.req_adr[ROW_OFF_BITS:LINE_OFF_BITS] + 1)
700 sync += r.req_adr[ROW_OFF_BITS:LINE_OFF_BITS].eq(rarange)
701 sync += Display("RARANGE r.req_adr:%x rarange:%x "
702 "stbs_zero:%x stbs_done:%x",
703 r.req_adr, rarange, stbs_zero, stbs_done)
704
705 # Incoming acks processing
706 with m.If(bus.ack):
707 sync += Display("WB_IN_ACK data:%x stbs_zero:%x "
708 "stbs_done:%x",
709 bus.dat_r, stbs_zero, stbs_done)
710
711 sync += r.rows_valid[r.store_row % ROW_PER_LINE].eq(1)
712
713 # Check for completion
714 with m.If(stbs_done & is_last_row(r.store_row, r.end_row_ix)):
715 # Complete wishbone cycle
716 sync += r.wb.cyc.eq(0)
717 # be nice, clear addr
718 sync += r.req_adr.eq(0)
719
720 # Cache line is now valid
721 cv = Signal(NUM_WAYS)
722 comb += cv.eq(cache_valids[r.store_index])
723 comb += cv.bit_select(replace_way, 1).eq(
724 r.store_valid & ~inval_in)
725 sync += cache_valids[r.store_index].eq(cv)
726
727 sync += r.state.eq(State.IDLE)
728
729 # move on to next request in row
730 # Increment store row counter
731 sync += r.store_row.eq(next_row(r.store_row))
732
733 # Cache miss/reload synchronous machine
734 def icache_miss(self, m, r, req_is_miss,
735 req_index, req_laddr, req_tag, replace_way,
736 cache_valids, access_ok, real_addr):
737 comb = m.d.comb
738 sync = m.d.sync
739
740 i_in, bus, m_in = self.i_in, self.bus, self.m_in
741 stall_in, flush_in = self.stall_in, self.flush_in
742 inval_in = self.inval_in
743
744 stbs_done = Signal()
745
746 comb += r.wb.sel.eq(-1)
747 comb += r.wb.adr.eq(r.req_adr[3:])
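        # the wishbone address is in 64-bit words, so the three byte-offset
        # bits of the (byte) request address are dropped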
748
749 # Process cache invalidations
750 with m.If(inval_in):
751 for i in range(NUM_LINES):
752 sync += cache_valids[i].eq(0)
753 sync += r.store_valid.eq(0)
754
755 # Main state machine
756 with m.Switch(r.state):
757
758 with m.Case(State.IDLE):
759 self.icache_miss_idle(m, r, req_is_miss, req_laddr,
760 req_index, req_tag, replace_way,
761 real_addr)
762
763 with m.Case(State.CLR_TAG, State.WAIT_ACK):
764 with m.If(r.state == State.CLR_TAG):
765 self.icache_miss_clr_tag(m, r, replace_way,
766 req_index,
767 cache_valids)
768
769 self.icache_miss_wait_ack(m, r, replace_way, inval_in,
770 cache_valids, stbs_done)
771
772 # TLB miss and protection fault processing
773 with m.If(flush_in | m_in.tlbld):
774 sync += r.fetch_failed.eq(0)
775 with m.Elif(i_in.req & ~access_ok & ~stall_in):
776 sync += r.fetch_failed.eq(1)
777
778 # icache_log: if LOG_LENGTH > 0 generate
779 def icache_log(self, m, req_hit_way, ra_valid, access_ok,
780 req_is_miss, req_is_hit, lway, wstate, r):
781 comb = m.d.comb
782 sync = m.d.sync
783
784 bus, i_out = self.bus, self.i_out
785 log_out, stall_out = self.log_out, self.stall_out
786
787 # Output data to logger
788 for i in range(LOG_LENGTH):
789 log_data = Signal(54)
790 lway = Signal(WAY_BITS)
791 wstate = Signal()
792
793 sync += lway.eq(req_hit_way)
794 sync += wstate.eq(0)
795
796 with m.If(r.state != State.IDLE):
797 sync += wstate.eq(1)
798
799 sync += log_data.eq(Cat(
800 ra_valid, access_ok, req_is_miss, req_is_hit,
801 lway, wstate, r.hit_nia[2:6], r.fetch_failed,
802 stall_out, bus.stall, r.wb.cyc, r.wb.stb,
803 r.real_addr[3:6], bus.ack, i_out.insn, i_out.valid
804 ))
805 comb += log_out.eq(log_data)
806
807 def elaborate(self, platform):
808
809 m = Module()
810 comb = m.d.comb
811
812 # Storage. Hopefully "cache_rows" is a BRAM, the rest is LUTs
813 cache_valids = CacheValidsArray()
814
815 # TLB Array
816 itlb = TLBArray()
817 itlb_valid = TLBValidArray()
818
819 # TODO to be passed to nmigen as ram attributes
820 # attribute ram_style of itlb_tags : signal is "distributed";
821 # attribute ram_style of itlb_ptes : signal is "distributed";
822
823 # Privilege bit from PTE EAA field
824 eaa_priv = Signal()
825
826 r = RegInternal()
827
828 # Async signal on incoming request
829 req_index = Signal(INDEX_BITS)
830 req_row = Signal(ROW_BITS)
831 req_hit_way = Signal(WAY_BITS)
832 req_tag = Signal(TAG_BITS)
833 req_is_hit = Signal()
834 req_is_miss = Signal()
835 req_laddr = Signal(64)
836
837 tlb_req_index = Signal(TLB_BITS)
838 real_addr = Signal(REAL_ADDR_BITS)
839 ra_valid = Signal()
840 priv_fault = Signal()
841 access_ok = Signal()
842 use_previous = Signal()
843
844 cache_out_row = Signal(ROW_SIZE_BITS)
845
846 plru_victim = Signal(WAY_BITS)
847 replace_way = Signal(WAY_BITS)
848
849 self.tlbmem = Memory(depth=TLB_SIZE, width=TLB_EA_TAG_BITS+TLB_PTE_BITS)
850 self.tagmem = Memory(depth=NUM_LINES, width=TAG_RAM_WIDTH)
851
852 # call sub-functions putting everything together,
853 # using shared signals established above
854 self.rams(m, r, cache_out_row, use_previous, replace_way, req_row)
855 self.maybe_plrus(m, r, plru_victim)
856 self.itlb_lookup(m, tlb_req_index, itlb, itlb_valid, real_addr,
857 ra_valid, eaa_priv, priv_fault,
858 access_ok)
859 self.itlb_update(m, itlb, itlb_valid)
860 self.icache_comb(m, use_previous, r, req_index, req_row, req_hit_way,
861 req_tag, real_addr, req_laddr,
862 cache_valids,
863 access_ok, req_is_hit, req_is_miss,
864 replace_way, plru_victim, cache_out_row)
865 self.icache_hit(m, use_previous, r, req_is_hit, req_hit_way,
866 req_index, req_tag, real_addr)
867 self.icache_miss(m, r, req_is_miss, req_index,
868 req_laddr, req_tag, replace_way,
869 cache_valids,
870 access_ok, real_addr)
871 #self.icache_log(m, log_out, req_hit_way, ra_valid, access_ok,
872 # req_is_miss, req_is_hit, lway, wstate, r)
873
874 # don't connect up to FetchUnitInterface so that some unit tests
875 # can continue to operate
876 if not self.use_fetch_iface:
877 return m
878
879 # connect to FetchUnitInterface. FetchUnitInterface is undocumented
880 # so needs checking and iterative revising
881 i_in, bus, i_out = self.i_in, self.bus, self.i_out
882 comb += i_in.req.eq(self.a_i_valid)
883 comb += i_in.nia.eq(self.a_pc_i)
884 comb += self.stall_in.eq(self.a_stall_i)
885 comb += self.f_fetch_err_o.eq(i_out.fetch_failed)
886 comb += self.f_badaddr_o.eq(i_out.nia)
887 comb += self.f_instr_o.eq(i_out.insn)
888 comb += self.f_busy_o.eq(~i_out.valid) # probably
889
890 # TODO, connect dcache wb_in/wb_out to "standard" nmigen Wishbone bus
891 ibus = self.ibus
892 comb += ibus.adr.eq(self.bus.adr)
893 comb += ibus.dat_w.eq(self.bus.dat_w)
894 comb += ibus.sel.eq(self.bus.sel)
895 comb += ibus.cyc.eq(self.bus.cyc)
896 comb += ibus.stb.eq(self.bus.stb)
897 comb += ibus.we.eq(self.bus.we)
898
899 comb += self.bus.dat_r.eq(ibus.dat_r)
900 comb += self.bus.ack.eq(ibus.ack)
901 if hasattr(ibus, "stall"):
902 comb += self.bus.stall.eq(ibus.stall)
903 else:
904 # fake-up the wishbone stall signal to comply with pipeline mode
905 # same thing is done in dcache.py
906 comb += self.bus.stall.eq(self.bus.cyc & ~self.bus.ack)
907
908 return m
909
910
911 def icache_sim(dut):
912 i_in = dut.i_in
913 i_out = dut.i_out
914 m_out = dut.m_in
915
916 yield i_in.priv_mode.eq(1)
917 yield i_in.req.eq(0)
918 yield i_in.nia.eq(0)
919 yield i_in.stop_mark.eq(0)
920 yield m_out.tlbld.eq(0)
921 yield m_out.tlbie.eq(0)
922 yield m_out.addr.eq(0)
923 yield m_out.pte.eq(0)
924 yield
925 yield
926 yield
927 yield
928
929 # miss, stalls for a bit
930 yield i_in.req.eq(1)
931 yield i_in.nia.eq(Const(0x0000000000000004, 64))
932 yield
933 valid = yield i_out.valid
934 while not valid:
935 yield
936 valid = yield i_out.valid
937 yield i_in.req.eq(0)
938
939 insn = yield i_out.insn
940 nia = yield i_out.nia
941 assert insn == 0x00000001, \
942 "insn @%x=%x expected 00000001" % (nia, insn)
943 yield i_in.req.eq(0)
944 yield
945
946 # hit
947 yield i_in.req.eq(1)
948 yield i_in.nia.eq(Const(0x0000000000000008, 64))
949 yield
950 valid = yield i_out.valid
951 while not valid:
952 yield
953 valid = yield i_out.valid
954 yield i_in.req.eq(0)
955
956 nia = yield i_out.nia
957 insn = yield i_out.insn
958 yield
959 assert insn == 0x00000002, \
960 "insn @%x=%x expected 00000002" % (nia, insn)
961
962 # another miss
963 yield i_in.req.eq(1)
964 yield i_in.nia.eq(Const(0x0000000000000040, 64))
965 yield
966 valid = yield i_out.valid
967 while not valid:
968 yield
969 valid = yield i_out.valid
970 yield i_in.req.eq(0)
971
972 nia = yield i_in.nia
973 insn = yield i_out.insn
974 assert insn == 0x00000010, \
975 "insn @%x=%x expected 00000010" % (nia, insn)
976
977 # test something that aliases (this only works because
978 # the unit test SRAM is a depth of 512)
979 yield i_in.req.eq(1)
980 yield i_in.nia.eq(Const(0x0000000000000100, 64))
981 yield
982 yield
983 valid = yield i_out.valid
984 assert not valid
985 for i in range(30):
986 yield
987 yield
988 insn = yield i_out.insn
989 valid = yield i_out.valid
990 insn = yield i_out.insn
991 assert valid
992 assert insn == 0x00000040, \
993 "insn @%x=%x expected 00000040" % (nia, insn)
994 yield i_in.req.eq(0)
995
996
997 def test_icache(mem):
998 from soc.config.test.test_loadstore import TestMemPspec
999 pspec = TestMemPspec(addr_wid=32,
1000 mask_wid=8,
1001 reg_wid=64,
1002 )
1003 dut = ICache(pspec)
1004
1005 memory = Memory(width=64, depth=512, init=mem)
1006 sram = SRAM(memory=memory, granularity=8)
1007
1008 m = Module()
1009
1010 m.submodules.icache = dut
1011 m.submodules.sram = sram
1012
1013 m.d.comb += sram.bus.cyc.eq(dut.bus.cyc)
1014 m.d.comb += sram.bus.stb.eq(dut.bus.stb)
1015 m.d.comb += sram.bus.we.eq(dut.bus.we)
1016 m.d.comb += sram.bus.sel.eq(dut.bus.sel)
1017 m.d.comb += sram.bus.adr.eq(dut.bus.adr)
1018 m.d.comb += sram.bus.dat_w.eq(dut.bus.dat_w)
1019
1020 m.d.comb += dut.bus.ack.eq(sram.bus.ack)
1021 m.d.comb += dut.bus.dat_r.eq(sram.bus.dat_r)
1022
1023 # nmigen Simulation
1024 sim = Simulator(m)
1025 sim.add_clock(1e-6)
1026
1027 sim.add_sync_process(wrap(icache_sim(dut)))
1028 with sim.write_vcd('test_icache.vcd'):
1029 sim.run()
1030
1031
1032 if __name__ == '__main__':
1033 from soc.config.test.test_loadstore import TestMemPspec
1034 pspec = TestMemPspec(addr_wid=64,
1035 mask_wid=8,
1036 reg_wid=64,
1037 )
1038 dut = ICache(pspec)
1039 vl = rtlil.convert(dut, ports=[])
1040 with open("test_icache.il", "w") as f:
1041 f.write(vl)
1042
1043 # set up memory every 32-bits with incrementing values 0 1 2 ...
1044 mem = []
1045 for i in range(512):
1046 mem.append((i*2) | ((i*2+1)<<32))
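    # i.e. 64-bit row i holds instruction 2*i in its low word and 2*i+1 in
    # its high word, so the instruction at byte address A is simply A//4
    # (matching the values asserted in icache_sim: 1 at 0x04, 2 at 0x08,
    # 0x10 at 0x40, 0x40 at 0x100)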
1047
1048 test_icache(mem)