use Record for ICache TLB
[soc.git] / src/soc/experiment/icache.py
1 """ICache
2
3 based on Anton Blanchard microwatt icache.vhdl
4
5 Set associative icache
6
7 TODO (in no specific order):
8 * Add debug interface to inspect cache content
9 * Add snoop/invalidate path
10 * Add multi-hit error detection
11 * Pipelined bus interface (wb or axi)
12 * Maybe add parity? There's a few bits free in each BRAM row on Xilinx
13 * Add optimization: service hits on partially loaded lines
14 * Add optimization: (maybe) interrupt reload on fluch/redirect
15 * Check if playing with the geometry of the cache tags allow for more
16 efficient use of distributed RAM and less logic/muxes. Currently we
17 write TAG_BITS width which may not match full ram blocks and might
18 cause muxes to be inferred for "partial writes".
19 * Check if making the read size of PLRU a ROM helps utilization
20 """

from enum import (Enum, unique)
from nmigen import (Module, Signal, Elaboratable, Cat, Array, Const, Repl,
                    Record)
from nmigen.cli import main, rtlil
from nmutil.iocontrol import RecordObject
from nmigen.utils import log2_int
from nmutil.util import Display

#from nmutil.plru import PLRU
from soc.experiment.cache_ram import CacheRam
from soc.experiment.plru import PLRU

from soc.experiment.mem_types import (Fetch1ToICacheType,
                                      ICacheToDecode1Type,
                                      MMUToICacheType)

from soc.experiment.wb_types import (WB_ADDR_BITS, WB_DATA_BITS,
                                     WB_SEL_BITS, WBAddrType, WBDataType,
                                     WBSelType, WBMasterOut, WBSlaveOut,
                                     )

from nmigen_soc.wishbone.bus import Interface

# for test
from soc.bus.sram import SRAM
from nmigen import Memory
from nmutil.util import wrap

# NOTE: to use cxxsim, export NMIGEN_SIM_MODE=cxxsim from the shell
# Also, check out the cxxsim nmigen branch, and latest yosys from git
from nmutil.sim_tmp_alternative import Simulator, Settle


SIM = 0
LINE_SIZE = 64
# BRAM organisation: We never access more than wishbone_data_bits
# at a time so to save resources we make the array only that wide,
# and use consecutive indices to make a cache "line"
#
# ROW_SIZE is the width in bytes of the BRAM (based on WB, so 64-bits)
ROW_SIZE = WB_DATA_BITS // 8
# Number of lines in a set
NUM_LINES = 16
# Number of ways
NUM_WAYS = 4
# L1 ITLB number of entries (direct mapped)
TLB_SIZE = 64
# L1 ITLB log_2(page_size)
TLB_LG_PGSZ = 12
# Number of real address bits that we store
REAL_ADDR_BITS = 56
# Non-zero to enable log data collection
LOG_LENGTH = 0

ROW_SIZE_BITS = ROW_SIZE * 8
# ROW_PER_LINE is the number of row (wishbone) transactions in a line
ROW_PER_LINE = LINE_SIZE // ROW_SIZE
# BRAM_ROWS is the number of rows in BRAM needed to represent the full icache
BRAM_ROWS = NUM_LINES * ROW_PER_LINE
# INSN_PER_ROW is the number of 32bit instructions per BRAM row
INSN_PER_ROW = ROW_SIZE_BITS // 32

# Bit fields counts in the address
#
# INSN_BITS is the number of bits to select an instruction in a row
INSN_BITS = log2_int(INSN_PER_ROW)
# ROW_BITS is the number of bits to select a row
ROW_BITS = log2_int(BRAM_ROWS)
# ROW_LINE_BITS is the number of bits to select a row within a line
ROW_LINE_BITS = log2_int(ROW_PER_LINE)
# LINE_OFF_BITS is the number of bits for the offset in a cache line
LINE_OFF_BITS = log2_int(LINE_SIZE)
# ROW_OFF_BITS is the number of bits for the offset in a row
ROW_OFF_BITS = log2_int(ROW_SIZE)
# INDEX_BITS is the number of bits to select a cache line
INDEX_BITS = log2_int(NUM_LINES)
# SET_SIZE_BITS is the log base 2 of the set size
SET_SIZE_BITS = LINE_OFF_BITS + INDEX_BITS
# TAG_BITS is the number of bits of the tag part of the address
TAG_BITS = REAL_ADDR_BITS - SET_SIZE_BITS
# TAG_WIDTH is the width in bits of each way of the tag RAM
# (TAG_BITS rounded up to the next multiple of 8)
TAG_WIDTH = TAG_BITS + 7 - ((TAG_BITS + 7) % 8)

# WAY_BITS is the number of bits to select a way
WAY_BITS = log2_int(NUM_WAYS)
TAG_RAM_WIDTH = TAG_BITS * NUM_WAYS

# L1 ITLB
TLB_BITS = log2_int(TLB_SIZE)
TLB_EA_TAG_BITS = 64 - (TLB_LG_PGSZ + TLB_BITS)
TLB_PTE_BITS = 64

print("BRAM_ROWS =", BRAM_ROWS)
print("INDEX_BITS =", INDEX_BITS)
print("INSN_BITS =", INSN_BITS)
print("INSN_PER_ROW =", INSN_PER_ROW)
print("LINE_SIZE =", LINE_SIZE)
print("LINE_OFF_BITS =", LINE_OFF_BITS)
print("LOG_LENGTH =", LOG_LENGTH)
print("NUM_LINES =", NUM_LINES)
print("NUM_WAYS =", NUM_WAYS)
print("REAL_ADDR_BITS =", REAL_ADDR_BITS)
print("ROW_BITS =", ROW_BITS)
print("ROW_OFF_BITS =", ROW_OFF_BITS)
print("ROW_LINE_BITS =", ROW_LINE_BITS)
print("ROW_PER_LINE =", ROW_PER_LINE)
print("ROW_SIZE =", ROW_SIZE)
print("ROW_SIZE_BITS =", ROW_SIZE_BITS)
print("SET_SIZE_BITS =", SET_SIZE_BITS)
print("SIM =", SIM)
print("TAG_BITS =", TAG_BITS)
print("TAG_RAM_WIDTH =", TAG_RAM_WIDTH)
print("TAG_WIDTH =", TAG_WIDTH)
print("TLB_BITS =", TLB_BITS)
print("TLB_EA_TAG_BITS =", TLB_EA_TAG_BITS)
print("TLB_LG_PGSZ =", TLB_LG_PGSZ)
print("TLB_PTE_BITS =", TLB_PTE_BITS)
print("TLB_SIZE =", TLB_SIZE)
print("WAY_BITS =", WAY_BITS)

# from microwatt/utils.vhdl
def ispow2(n):
    """Return True if n is a power of 2,
    e.g. ispow2(64) -> True, ispow2(48) -> False."""
    return n != 0 and (n & (n - 1)) == 0

assert LINE_SIZE % ROW_SIZE == 0
assert ispow2(LINE_SIZE), "LINE_SIZE not power of 2"
assert ispow2(NUM_LINES), "NUM_LINES not power of 2"
assert ispow2(ROW_PER_LINE), "ROW_PER_LINE not power of 2"
assert ispow2(INSN_PER_ROW), "INSN_PER_ROW not power of 2"
assert (ROW_BITS == (INDEX_BITS + ROW_LINE_BITS)), \
    "geometry bits don't add up"
assert (LINE_OFF_BITS == (ROW_OFF_BITS + ROW_LINE_BITS)), \
    "geometry bits don't add up"
assert (REAL_ADDR_BITS == (TAG_BITS + INDEX_BITS + LINE_OFF_BITS)), \
    "geometry bits don't add up"
assert (REAL_ADDR_BITS == (TAG_BITS + ROW_BITS + ROW_OFF_BITS)), \
    "geometry bits don't add up"

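# For reference, the default geometry above works out to (same values
# as printed at import time): ROW_SIZE=8, ROW_PER_LINE=8, BRAM_ROWS=128,
# INSN_PER_ROW=2, INSN_BITS=1, ROW_BITS=7, ROW_LINE_BITS=3,
# LINE_OFF_BITS=6, ROW_OFF_BITS=3, INDEX_BITS=4, SET_SIZE_BITS=10,
# TAG_BITS=46, TAG_WIDTH=48, WAY_BITS=2, TAG_RAM_WIDTH=184, TLB_BITS=6,
# TLB_EA_TAG_BITS=46.
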
# Example of layout for 32 lines of 64 bytes:
#
# ..  tag    |index|  line  |
# ..         |   row   |    |
# ..         |     |   | |00| zero          (2)
# ..         |     |   |-|  | INSN_BITS     (1)
# ..         |     |---|    | ROW_LINE_BITS (3)
# ..         |     |--- - --| LINE_OFF_BITS (6)
# ..         |         |- --| ROW_OFF_BITS  (3)
# ..         |----- ---|    | ROW_BITS      (8)
# ..         |-----|        | INDEX_BITS    (5)
# .. --------|              | TAG_BITS      (53)

# The cache data BRAM organized as described above for each way
#subtype cache_row_t is std_ulogic_vector(ROW_SIZE_BITS-1 downto 0);
#
# The cache tags LUTRAM has a row per set. Vivado is a pain and will
# not handle a clean (commented) definition of the cache tags as a 3d
# memory. For now, work around it by putting all the tags for a set
# into a single wide per-line signal.
def CacheTagArray():
    return Array(Signal(TAG_RAM_WIDTH, name="cachetag_%d" % x) \
                 for x in range(NUM_LINES))

# The cache valid bits
def CacheValidBitsArray():
    return Array(Signal(NUM_WAYS, name="cachevalid_%d" % x) \
                 for x in range(NUM_LINES))

def RowPerLineValidArray():
    return Array(Signal(name="rows_valid_%d" % x) \
                 for x in range(ROW_PER_LINE))


# TODO to be passed to nmigen as ram attributes
# attribute ram_style : string;
# attribute ram_style of cache_tags : signal is "distributed";

tlb_layout = [('valid', 1),
              ('tag', TLB_EA_TAG_BITS),
              ('pte', TLB_PTE_BITS)
             ]

def TLBArray():
    return Array(Record(tlb_layout, name="tlb%d" % x) for x in range(TLB_SIZE))

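# Usage sketch for the Record-based TLB (the change this commit makes,
# presumably replacing separate per-field arrays): each entry is one
# Record, so fields are read and written by name, e.g. inside
# elaborate() below:
#
#     itlb = TLBArray()
#     comb += pte.eq(itlb[tlb_req_index].pte)
#     sync += itlb[wr_index].valid.eq(1)
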
# Cache RAM interface
def CacheRamOut():
    return Array(Signal(ROW_SIZE_BITS, name="cache_out_%d" % x) \
                 for x in range(NUM_WAYS))

# PLRU output interface
def PLRUOut():
    return Array(Signal(WAY_BITS, name="plru_out_%d" % x) \
                 for x in range(NUM_LINES))

# Return the cache line index (tag index) for an address
def get_index(addr):
    return addr[LINE_OFF_BITS:SET_SIZE_BITS]

# Return the cache row index (data memory) for an address
def get_row(addr):
    return addr[ROW_OFF_BITS:SET_SIZE_BITS]

# Return the index of a row within a line
def get_row_of_line(row):
    return row[:ROW_LINE_BITS]

# Returns whether the address is in the last row of a line
def is_last_row_addr(addr, last):
    return addr[ROW_OFF_BITS:LINE_OFF_BITS] == last

# Returns whether the row index is the last row of a line
def is_last_row(row, last):
    return get_row_of_line(row) == last

# Return the next row in the current cache line. We use a dedicated
# function in order to limit the size of the generated adder to be
# only the bits within a cache line (3 bits with default settings)
def next_row(row):
    row_v = row[0:ROW_LINE_BITS] + 1
    return Cat(row_v[:ROW_LINE_BITS], row[ROW_LINE_BITS:])

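# Quick integer model of next_row() (an illustrative sketch, not used
# by the hardware): only the low ROW_LINE_BITS increment and wrap;
# the upper line-select bits are untouched.
_mask = ROW_PER_LINE - 1
_row = 5 * ROW_PER_LINE + _mask          # last row of (example) line 5
assert (_row & ~_mask) | ((_row + 1) & _mask) == 5 * ROW_PER_LINE
del _mask, _row
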
# Read the instruction word for the given address
# in the current cache row
def read_insn_word(addr, data):
    word = addr[2:INSN_BITS+2]
    return data.word_select(word, 32)

# Get the tag value from the address
def get_tag(addr):
    return addr[SET_SIZE_BITS:REAL_ADDR_BITS]

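# Integer sanity-check of the address split used by get_index(),
# get_tag() and the line offset (illustrative only; _ex_addr is an
# arbitrary example value): tag, index and line offset partition the
# real address exactly.
_ex_addr = 0x0003_1234_5678              # any REAL_ADDR_BITS-wide value
_off = _ex_addr & (LINE_SIZE - 1)
_idx = (_ex_addr >> LINE_OFF_BITS) & (NUM_LINES - 1)
_tag = _ex_addr >> SET_SIZE_BITS
assert (_tag << SET_SIZE_BITS) | (_idx << LINE_OFF_BITS) | _off == _ex_addr
del _ex_addr, _off, _idx, _tag
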
# Read a tag from a tag memory row. Note the tag RAM rows are packed
# at TAG_BITS (not the byte-padded TAG_WIDTH) per way, to match
# TAG_RAM_WIDTH above.
def read_tag(way, tagset):
    return tagset.word_select(way, TAG_BITS)

# Write a tag to tag memory row
def write_tag(way, tagset, tag):
    return read_tag(way, tagset).eq(tag)

# Simple hash for direct-mapped TLB index
def hash_ea(addr):
    hsh = addr[TLB_LG_PGSZ:TLB_LG_PGSZ + TLB_BITS] ^ addr[
          TLB_LG_PGSZ + TLB_BITS:TLB_LG_PGSZ + 2 * TLB_BITS
          ] ^ addr[
          TLB_LG_PGSZ + 2 * TLB_BITS:TLB_LG_PGSZ + 3 * TLB_BITS
          ]
    return hsh

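# Plain-Python mirror of hash_ea() for quick checking (illustrative
# sketch only): three TLB_BITS-wide fields above the page offset are
# XORed together to form the direct-mapped TLB index.
def _hash_ea_int(addr):
    def fld(i):
        return (addr >> (TLB_LG_PGSZ + i * TLB_BITS)) & (TLB_SIZE - 1)
    return fld(0) ^ fld(1) ^ fld(2)

assert _hash_ea_int(0) == 0
assert _hash_ea_int(1 << TLB_LG_PGSZ) == 1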

# Cache reload state machine
@unique
class State(Enum):
    IDLE     = 0
    CLR_TAG  = 1
    WAIT_ACK = 2

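# Reload FSM transitions, as implemented by the icache_miss_* methods
# below (summary comment, derived from the code):
#
#   IDLE     --(req_is_miss)------------------> CLR_TAG
#   CLR_TAG  --(always, next cycle)-----------> WAIT_ACK
#   WAIT_ACK --(stbs_done & last row acked)---> IDLE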

class RegInternal(RecordObject):
    def __init__(self):
        super().__init__()
        # Cache hit state (Latches for 1 cycle BRAM access)
        self.hit_way = Signal(WAY_BITS)
        self.hit_nia = Signal(64)
        self.hit_smark = Signal()
        self.hit_valid = Signal()

        # Cache miss state (reload state machine)
        self.state = Signal(State, reset=State.IDLE)
        self.wb = WBMasterOut("wb")
        self.req_adr = Signal(64)
        self.store_way = Signal(WAY_BITS)
        self.store_index = Signal(INDEX_BITS)
        self.store_row = Signal(ROW_BITS)
        self.store_tag = Signal(TAG_BITS)
        self.store_valid = Signal()
        self.end_row_ix = Signal(ROW_LINE_BITS)
        self.rows_valid = RowPerLineValidArray()

        # TLB miss state
        self.fetch_failed = Signal()


class ICache(Elaboratable):
    """64 bit set associative icache. All instructions are 4B aligned."""
    def __init__(self):
        self.i_in = Fetch1ToICacheType(name="i_in")
        self.i_out = ICacheToDecode1Type(name="i_out")

        self.m_in = MMUToICacheType(name="m_in")

        self.stall_in = Signal()
        self.stall_out = Signal()
        self.flush_in = Signal()
        self.inval_in = Signal()

        # standard naming (wired to non-standard for compatibility)
        self.bus = Interface(addr_width=32,
                             data_width=64,
                             granularity=8,
                             features={'stall'},
                             alignment=0,
                             name="icache")

        self.log_out = Signal(54)


    # Generate a cache RAM for each way
    def rams(self, m, r, cache_out_row, use_previous,
             replace_way, req_row):

        comb = m.d.comb
        sync = m.d.sync

        bus, stall_in = self.bus, self.stall_in

        for i in range(NUM_WAYS):
            do_read = Signal(name="do_rd_%d" % i)
            do_write = Signal(name="do_wr_%d" % i)
            rd_addr = Signal(ROW_BITS)
            wr_addr = Signal(ROW_BITS)
            d_out = Signal(ROW_SIZE_BITS, name="d_out_%d" % i)
            wr_sel = Signal(ROW_SIZE)

            way = CacheRam(ROW_BITS, ROW_SIZE_BITS, True, ram_num=i)
            setattr(m.submodules, "cacheram_%d" % i, way)

            comb += way.rd_en.eq(do_read)
            comb += way.rd_addr.eq(rd_addr)
            comb += d_out.eq(way.rd_data_o)
            comb += way.wr_sel.eq(wr_sel)
            comb += way.wr_addr.eq(wr_addr)
            comb += way.wr_data.eq(bus.dat_r)

            comb += do_read.eq(~(stall_in | use_previous))
            comb += do_write.eq(bus.ack & (replace_way == i))

            with m.If(do_write):
                sync += Display("cache write adr: %x data: %lx",
                                wr_addr, way.wr_data)

            with m.If(r.hit_way == i):
                comb += cache_out_row.eq(d_out)
                with m.If(do_read):
                    sync += Display("cache read adr: %x data: %x",
                                    req_row, d_out)

            comb += rd_addr.eq(req_row)
            comb += wr_addr.eq(r.store_row)
            comb += wr_sel.eq(Repl(do_write, ROW_SIZE))

    # Generate PLRUs
    def maybe_plrus(self, m, r, plru_victim):
        comb = m.d.comb

        if NUM_WAYS <= 1:
            return  # no PLRU needed for a single way

        for i in range(NUM_LINES):
            plru = PLRU(WAY_BITS)
            setattr(m.submodules, "plru_%d" % i, plru)

            # PLRU interface: mark the hit way as recently used
            # whenever this line has a validated hit (acc_en
            # defaults to 0 for all other lines)
            with m.If(get_index(r.hit_nia) == i):
                comb += plru.acc_en.eq(r.hit_valid)

            comb += plru.acc_i.eq(r.hit_way)
            comb += plru_victim[i].eq(plru.lru_o)

    # TLB hit detection and real address generation
    def itlb_lookup(self, m, tlb_req_index, itlb,
                    real_addr, ra_valid, eaa_priv,
                    priv_fault, access_ok):

        comb = m.d.comb

        i_in = self.i_in

        pte = Signal(TLB_PTE_BITS)
        ttag = Signal(TLB_EA_TAG_BITS)

        comb += tlb_req_index.eq(hash_ea(i_in.nia))
        comb += pte.eq(itlb[tlb_req_index].pte)
        comb += ttag.eq(itlb[tlb_req_index].tag)

        with m.If(i_in.virt_mode):
            comb += real_addr.eq(Cat(
                     i_in.nia[:TLB_LG_PGSZ],
                     pte[TLB_LG_PGSZ:REAL_ADDR_BITS]
                    ))

            with m.If(ttag == i_in.nia[TLB_LG_PGSZ + TLB_BITS:64]):
                comb += ra_valid.eq(itlb[tlb_req_index].valid)

            comb += eaa_priv.eq(pte[3])

        with m.Else():
            comb += real_addr.eq(i_in.nia[:REAL_ADDR_BITS])
            comb += ra_valid.eq(1)
            comb += eaa_priv.eq(1)

        # No IAMR, so no KUEP support for now
        comb += priv_fault.eq(eaa_priv & ~i_in.priv_mode)
        comb += access_ok.eq(ra_valid & ~priv_fault)

    # iTLB update
    def itlb_update(self, m, itlb):
        comb = m.d.comb
        sync = m.d.sync

        m_in = self.m_in

        wr_index = Signal(TLB_BITS)  # an index, not one bit per entry
        comb += wr_index.eq(hash_ea(m_in.addr))

        with m.If(m_in.tlbie & m_in.doall):
            # Clear all valid bits
            for i in range(TLB_SIZE):
                sync += itlb[i].valid.eq(0)

        with m.Elif(m_in.tlbie):
            # Clear entry regardless of hit or miss
            sync += itlb[wr_index].valid.eq(0)

        with m.Elif(m_in.tlbld):
            sync += itlb[wr_index].tag.eq(m_in.addr[TLB_LG_PGSZ + TLB_BITS:64])
            sync += itlb[wr_index].pte.eq(m_in.pte)
            sync += itlb[wr_index].valid.eq(1)

    # Cache hit detection, output to fetch2 and other misc logic
    def icache_comb(self, m, use_previous, r, req_index, req_row,
                    req_hit_way, req_tag, real_addr, req_laddr,
                    cache_valid_bits, cache_tags, access_ok,
                    req_is_hit, req_is_miss, replace_way,
                    plru_victim, cache_out_row):

        comb = m.d.comb

        i_in, i_out, bus = self.i_in, self.i_out, self.bus
        flush_in, stall_out = self.flush_in, self.stall_out

        is_hit = Signal()
        hit_way = Signal(WAY_BITS)

        # i_in.sequential means that i_in.nia this cycle is 4 more than
        # last cycle. If we read more than 32 bits at a time, had a
        # cache hit last cycle, and we don't want the first 32-bit chunk
        # then we can keep the data we read last cycle and just use that.
        with m.If(i_in.nia[2:INSN_BITS+2] != 0):
            comb += use_previous.eq(i_in.sequential & r.hit_valid)

        # Extract line, row and tag from request
        comb += req_index.eq(get_index(i_in.nia))
        comb += req_row.eq(get_row(i_in.nia))
        comb += req_tag.eq(get_tag(real_addr))

        # Calculate address of beginning of cache row, will be
        # used for cache miss processing if needed
        comb += req_laddr.eq(Cat(
                 Const(0, ROW_OFF_BITS),
                 real_addr[ROW_OFF_BITS:REAL_ADDR_BITS],
                ))

        # Test if pending request is a hit on any way
        hitcond = Signal()
        comb += hitcond.eq((r.state == State.WAIT_ACK)
                           & (req_index == r.store_index)
                           & r.rows_valid[req_row % ROW_PER_LINE]
                          )
        with m.If(i_in.req):
            cvb = Signal(NUM_WAYS)
            ctag = Signal(TAG_RAM_WIDTH)
            comb += ctag.eq(cache_tags[req_index])
            comb += cvb.eq(cache_valid_bits[req_index])
            for i in range(NUM_WAYS):
                tagi = Signal(TAG_BITS, name="tag_i%d" % i)
                comb += tagi.eq(read_tag(i, ctag))
                hit_test = Signal(name="hit_test%d" % i)
                comb += hit_test.eq(i == r.store_way)
                with m.If((cvb[i] | (hitcond & hit_test))
                          & (tagi == req_tag)):
                    comb += hit_way.eq(i)
                    comb += is_hit.eq(1)

        # Generate the "hit" and "miss" signals
        # for the synchronous blocks
        with m.If(i_in.req & access_ok & ~flush_in):
            comb += req_is_hit.eq(is_hit)
            comb += req_is_miss.eq(~is_hit)

        comb += req_hit_way.eq(hit_way)

        # The way to replace on a miss
        with m.If(r.state == State.CLR_TAG):
            comb += replace_way.eq(plru_victim[r.store_index])
        with m.Else():
            comb += replace_way.eq(r.store_way)

        # Output instruction from current cache row
        #
        # Note: This is a mild violation of our design principle of
        # having pipeline stages output from a clean latch. In this
        # case we output the result of a mux. The alternative would
        # be to output an entire row, which I prefer not to do just yet
        # as it would force fetch2 to know about some of the cache
        # geometry information.
        comb += i_out.insn.eq(read_insn_word(r.hit_nia, cache_out_row))
        comb += i_out.valid.eq(r.hit_valid)
        comb += i_out.nia.eq(r.hit_nia)
        comb += i_out.stop_mark.eq(r.hit_smark)
        comb += i_out.fetch_failed.eq(r.fetch_failed)

        # Stall fetch1 if we have a miss on cache or TLB
        # or a protection fault
        comb += stall_out.eq(~(is_hit & access_ok))

        # Wishbone requests output (from the cache miss reload machine)
        comb += bus.we.eq(r.wb.we)
        comb += bus.adr.eq(r.wb.adr)
        comb += bus.sel.eq(r.wb.sel)
        comb += bus.stb.eq(r.wb.stb)
        comb += bus.dat_w.eq(r.wb.dat)
        comb += bus.cyc.eq(r.wb.cyc)

    # Cache hit synchronous machine
    def icache_hit(self, m, use_previous, r, req_is_hit, req_hit_way,
                   req_index, req_tag, real_addr):
        sync = m.d.sync

        i_in, stall_in = self.i_in, self.stall_in
        flush_in = self.flush_in

        # keep outputs to fetch2 unchanged on a stall
        # except that flush or reset sets valid to 0
        # If use_previous, keep the same data as last
        # cycle and use the second half
        with m.If(stall_in | use_previous):
            with m.If(flush_in):
                sync += r.hit_valid.eq(0)
        with m.Else():
            # On a hit, latch the request for the next cycle,
            # when the BRAM data will be available on the
            # cache_out output of the corresponding way
            sync += r.hit_valid.eq(req_is_hit)

            with m.If(req_is_hit):
                sync += r.hit_way.eq(req_hit_way)
                sync += Display(
                         "cache hit nia:%x IR:%x SM:%x idx:%x tag:%x "
                         "way:%x RA:%x", i_in.nia, i_in.virt_mode,
                         i_in.stop_mark, req_index, req_tag,
                         req_hit_way, real_addr
                        )

        with m.If(~stall_in):
            # Send stop marks and NIA down regardless of validity
            sync += r.hit_smark.eq(i_in.stop_mark)
            sync += r.hit_nia.eq(i_in.nia)

    def icache_miss_idle(self, m, r, req_is_miss, req_laddr,
                         req_index, req_tag, replace_way, real_addr):
        comb = m.d.comb
        sync = m.d.sync

        i_in = self.i_in

        # Reset per-row valid flags, only used in WAIT_ACK
        for i in range(ROW_PER_LINE):
            sync += r.rows_valid[i].eq(0)

        # We need to read a cache line
        with m.If(req_is_miss):
            sync += Display(
                     "cache miss nia:%x IR:%x SM:%x idx:%x "
                     "way:%x tag:%x RA:%x", i_in.nia,
                     i_in.virt_mode, i_in.stop_mark, req_index,
                     replace_way, req_tag, real_addr
                    )

            # Keep track of our index and way for subsequent stores
            st_row = Signal(ROW_BITS)
            comb += st_row.eq(get_row(req_laddr))
            sync += r.store_index.eq(req_index)
            sync += r.store_row.eq(st_row)
            sync += r.store_tag.eq(req_tag)
            sync += r.store_valid.eq(1)
            sync += r.end_row_ix.eq(get_row_of_line(st_row) - 1)

            # Prep for first wishbone read. We calculate the address
            # of the start of the cache line and start the WB cycle.
            sync += r.req_adr.eq(req_laddr)
            sync += r.wb.cyc.eq(1)
            sync += r.wb.stb.eq(1)

            # Track that we had one request sent
            sync += r.state.eq(State.CLR_TAG)

    def icache_miss_clr_tag(self, m, r, replace_way,
                            cache_valid_bits, req_index,
                            tagset, cache_tags):

        comb = m.d.comb
        sync = m.d.sync

        # Get victim way from plru
        sync += r.store_way.eq(replace_way)
        # Force misses on that way while reloading that line
        cv = Signal(NUM_WAYS)  # one valid bit per way
        comb += cv.eq(cache_valid_bits[req_index])
        comb += cv.bit_select(replace_way, 1).eq(0)
        sync += cache_valid_bits[req_index].eq(cv)

        for i in range(NUM_WAYS):
            with m.If(i == replace_way):
                comb += tagset.eq(cache_tags[r.store_index])
                comb += write_tag(i, tagset, r.store_tag)
                sync += cache_tags[r.store_index].eq(tagset)

        sync += r.state.eq(State.WAIT_ACK)

    def icache_miss_wait_ack(self, m, r, replace_way, inval_in,
                             stbs_done, cache_valid_bits):
        comb = m.d.comb
        sync = m.d.sync

        bus = self.bus

        # Requests are all sent if stb is 0
        stbs_zero = Signal()
        comb += stbs_zero.eq(r.wb.stb == 0)
        comb += stbs_done.eq(stbs_zero)

        # If we are still sending requests, was one accepted?
        with m.If(~bus.stall & ~stbs_zero):
            # That was the last word? We are done sending.
            # Clear stb and set stbs_done so we can handle
            # an eventual last ack on the same cycle.
            with m.If(is_last_row_addr(r.req_adr, r.end_row_ix)):
                sync += Display(
                         "IS_LAST_ROW_ADDR r.wb.addr:%x "
                         "r.end_row_ix:%x r.wb.stb:%x stbs_zero:%x "
                         "stbs_done:%x", r.wb.adr, r.end_row_ix,
                         r.wb.stb, stbs_zero, stbs_done
                        )
                sync += r.wb.stb.eq(0)
                comb += stbs_done.eq(1)

            # Calculate the next row address
            rarange = Signal(LINE_OFF_BITS - ROW_OFF_BITS)
            comb += rarange.eq(r.req_adr[ROW_OFF_BITS:LINE_OFF_BITS] + 1)
            sync += r.req_adr[ROW_OFF_BITS:LINE_OFF_BITS].eq(rarange)
            sync += Display("RARANGE r.req_adr:%x rarange:%x "
                            "stbs_zero:%x stbs_done:%x",
                            r.req_adr, rarange, stbs_zero, stbs_done)

        # Incoming acks processing
        with m.If(bus.ack):
            sync += Display("WB_IN_ACK data:%x stbs_zero:%x "
                            "stbs_done:%x",
                            bus.dat_r, stbs_zero, stbs_done)

            sync += r.rows_valid[r.store_row % ROW_PER_LINE].eq(1)

            # Check for completion
            with m.If(stbs_done & is_last_row(r.store_row, r.end_row_ix)):
                # Complete wishbone cycle
                sync += r.wb.cyc.eq(0)
                # be nice, clear addr
                sync += r.req_adr.eq(0)

                # Cache line is now valid
                cv = Signal(NUM_WAYS)  # one valid bit per way
                comb += cv.eq(cache_valid_bits[r.store_index])
                comb += cv.bit_select(replace_way, 1).eq(
                         r.store_valid & ~inval_in
                        )
                sync += cache_valid_bits[r.store_index].eq(cv)

                sync += r.state.eq(State.IDLE)

            # move on to next request in row
            # Increment store row counter
            sync += r.store_row.eq(next_row(r.store_row))

    # Cache miss/reload synchronous machine
    def icache_miss(self, m, cache_valid_bits, r, req_is_miss,
                    req_index, req_laddr, req_tag, replace_way,
                    cache_tags, access_ok, real_addr):
        comb = m.d.comb
        sync = m.d.sync

        i_in, bus, m_in = self.i_in, self.bus, self.m_in
        stall_in, flush_in = self.stall_in, self.flush_in
        inval_in = self.inval_in

        tagset = Signal(TAG_RAM_WIDTH)
        stbs_done = Signal()

        comb += r.wb.sel.eq(-1)
        comb += r.wb.adr.eq(r.req_adr[3:])

        # Process cache invalidations
        with m.If(inval_in):
            for i in range(NUM_LINES):
                sync += cache_valid_bits[i].eq(0)
            sync += r.store_valid.eq(0)

        # Main state machine
        with m.Switch(r.state):

            with m.Case(State.IDLE):
                self.icache_miss_idle(
                    m, r, req_is_miss, req_laddr,
                    req_index, req_tag, replace_way,
                    real_addr
                )

            with m.Case(State.CLR_TAG, State.WAIT_ACK):
                with m.If(r.state == State.CLR_TAG):
                    self.icache_miss_clr_tag(
                        m, r, replace_way,
                        cache_valid_bits, req_index,
                        tagset, cache_tags
                    )

                self.icache_miss_wait_ack(
                    m, r, replace_way, inval_in,
                    stbs_done, cache_valid_bits
                )

        # TLB miss and protection fault processing
        with m.If(flush_in | m_in.tlbld):
            sync += r.fetch_failed.eq(0)
        with m.Elif(i_in.req & ~access_ok & ~stall_in):
            sync += r.fetch_failed.eq(1)

    # icache_log: if LOG_LENGTH > 0 generate
    def icache_log(self, m, req_hit_way, ra_valid, access_ok,
                   req_is_miss, req_is_hit, real_addr, r):
        comb = m.d.comb
        sync = m.d.sync

        bus, i_out = self.bus, self.i_out
        log_out, stall_out = self.log_out, self.stall_out

        # Output data to logger
        for i in range(LOG_LENGTH):
            log_data = Signal(54)
            lway = Signal(WAY_BITS)
            wstate = Signal()

            sync += lway.eq(req_hit_way)
            sync += wstate.eq(0)

            with m.If(r.state != State.IDLE):
                sync += wstate.eq(1)

            sync += log_data.eq(Cat(
                     ra_valid, access_ok, req_is_miss, req_is_hit,
                     lway, wstate, r.hit_nia[2:6], r.fetch_failed,
                     stall_out, bus.stall, r.wb.cyc, r.wb.stb,
                     real_addr[3:6], bus.ack, i_out.insn, i_out.valid
                    ))
            comb += log_out.eq(log_data)

    def elaborate(self, platform):

        m = Module()
        comb = m.d.comb

        # Storage. Hopefully "cache_rows" is a BRAM, the rest is LUTs
        cache_tags = CacheTagArray()
        cache_valid_bits = CacheValidBitsArray()

        itlb = TLBArray()

        # TODO to be passed to nmigen as ram attributes
        # attribute ram_style of itlb_tags : signal is "distributed";
        # attribute ram_style of itlb_ptes : signal is "distributed";

        # Privilege bit from PTE EAA field
        eaa_priv = Signal()

        r = RegInternal()

        # Async signals on incoming request
        req_index = Signal(INDEX_BITS)
        req_row = Signal(ROW_BITS)
        req_hit_way = Signal(WAY_BITS)
        req_tag = Signal(TAG_BITS)
        req_is_hit = Signal()
        req_is_miss = Signal()
        req_laddr = Signal(64)

        tlb_req_index = Signal(TLB_BITS)  # an index, not one bit per entry
        real_addr = Signal(REAL_ADDR_BITS)
        ra_valid = Signal()
        priv_fault = Signal()
        access_ok = Signal()
        use_previous = Signal()

        cache_out_row = Signal(ROW_SIZE_BITS)

        plru_victim = PLRUOut()
        replace_way = Signal(WAY_BITS)

        # fake-up the wishbone stall signal to comply with pipeline mode
        # same thing is done in dcache.py
        comb += self.bus.stall.eq(self.bus.cyc & ~self.bus.ack)

        # call sub-functions putting everything together,
        # using shared signals established above
        self.rams(m, r, cache_out_row, use_previous, replace_way, req_row)
        self.maybe_plrus(m, r, plru_victim)
        self.itlb_lookup(m, tlb_req_index, itlb, real_addr,
                         ra_valid, eaa_priv, priv_fault,
                         access_ok)
        self.itlb_update(m, itlb)
        self.icache_comb(m, use_previous, r, req_index, req_row, req_hit_way,
                         req_tag, real_addr, req_laddr, cache_valid_bits,
                         cache_tags, access_ok, req_is_hit, req_is_miss,
                         replace_way, plru_victim, cache_out_row)
        self.icache_hit(m, use_previous, r, req_is_hit, req_hit_way,
                        req_index, req_tag, real_addr)
        self.icache_miss(m, cache_valid_bits, r, req_is_miss, req_index,
                         req_laddr, req_tag, replace_way, cache_tags,
                         access_ok, real_addr)
        #self.icache_log(m, req_hit_way, ra_valid, access_ok,
        #                req_is_miss, req_is_hit, real_addr, r)

        return m


def icache_sim(dut):
    i_in = dut.i_in
    i_out = dut.i_out
    m_in = dut.m_in

    yield i_in.priv_mode.eq(1)
    yield i_in.req.eq(0)
    yield i_in.nia.eq(0)
    yield i_in.stop_mark.eq(0)
    yield m_in.tlbld.eq(0)
    yield m_in.tlbie.eq(0)
    yield m_in.addr.eq(0)
    yield m_in.pte.eq(0)
    yield
    yield
    yield
    yield

    # miss, stalls for a bit
    yield i_in.req.eq(1)
    yield i_in.nia.eq(Const(0x0000000000000004, 64))
    yield
    valid = yield i_out.valid
    while not valid:
        yield
        valid = yield i_out.valid
    yield i_in.req.eq(0)

    insn = yield i_out.insn
    nia = yield i_out.nia
    assert insn == 0x00000001, \
        "insn @%x=%x expected 00000001" % (nia, insn)
    yield

    # hit
    yield i_in.req.eq(1)
    yield i_in.nia.eq(Const(0x0000000000000008, 64))
    yield
    valid = yield i_out.valid
    while not valid:
        yield
        valid = yield i_out.valid
    yield i_in.req.eq(0)

    nia = yield i_out.nia
    insn = yield i_out.insn
    yield
    assert insn == 0x00000002, \
        "insn @%x=%x expected 00000002" % (nia, insn)

    # another miss
    yield i_in.req.eq(1)
    yield i_in.nia.eq(Const(0x0000000000000040, 64))
    yield
    valid = yield i_out.valid
    while not valid:
        yield
        valid = yield i_out.valid
    yield i_in.req.eq(0)

    nia = yield i_out.nia
    insn = yield i_out.insn
    assert insn == 0x00000010, \
        "insn @%x=%x expected 00000010" % (nia, insn)

    # test something that aliases (this only works because
    # the unit test SRAM is a depth of 512)
    yield i_in.req.eq(1)
    yield i_in.nia.eq(Const(0x0000000000000100, 64))
    yield
    yield
    valid = yield i_out.valid
    assert not valid
    for i in range(30):
        yield
    yield
    valid = yield i_out.valid
    nia = yield i_out.nia
    insn = yield i_out.insn
    assert valid
    assert insn == 0x00000040, \
        "insn @%x=%x expected 00000040" % (nia, insn)
    yield i_in.req.eq(0)


def test_icache(mem):
    dut = ICache()

    memory = Memory(width=64, depth=512, init=mem)
    sram = SRAM(memory=memory, granularity=8)

    m = Module()

    m.submodules.icache = dut
    m.submodules.sram = sram

    m.d.comb += sram.bus.cyc.eq(dut.bus.cyc)
    m.d.comb += sram.bus.stb.eq(dut.bus.stb)
    m.d.comb += sram.bus.we.eq(dut.bus.we)
    m.d.comb += sram.bus.sel.eq(dut.bus.sel)
    m.d.comb += sram.bus.adr.eq(dut.bus.adr)
    m.d.comb += sram.bus.dat_w.eq(dut.bus.dat_w)

    m.d.comb += dut.bus.ack.eq(sram.bus.ack)
    m.d.comb += dut.bus.dat_r.eq(sram.bus.dat_r)

    # nmigen Simulation
    sim = Simulator(m)
    sim.add_clock(1e-6)

    sim.add_sync_process(wrap(icache_sim(dut)))
    with sim.write_vcd('test_icache.vcd'):
        sim.run()

if __name__ == '__main__':
    dut = ICache()
    vl = rtlil.convert(dut, ports=[])
    with open("test_icache.il", "w") as f:
        f.write(vl)

    # set up memory every 32-bits with incrementing values 0 1 2 ...
    mem = []
    for i in range(512):
        mem.append((i*2) | ((i*2+1)<<32))

    test_icache(mem)