"""ICache

based on Anton Blanchard microwatt icache.vhdl

Set associative icache

TODO (in no specific order):
* Add debug interface to inspect cache content
* Add snoop/invalidate path
* Add multi-hit error detection
* Pipelined bus interface (wb or axi)
* Maybe add parity? There are a few bits free in each BRAM row on Xilinx
* Add optimization: service hits on partially loaded lines
* Add optimization: (maybe) interrupt reload on flush/redirect
* Check if playing with the geometry of the cache tags allows for more
  efficient use of distributed RAM and less logic/muxes. Currently we
  write TAG_BITS width which may not match full ram blocks and might
  cause muxes to be inferred for "partial writes".
* Check if making the read size of the PLRU a ROM helps utilization
"""

from enum import (Enum, unique)
from nmigen import (Module, Signal, Elaboratable, Cat, Array, Const, Repl,
                    Record)
from nmigen.cli import main, rtlil
from nmutil.iocontrol import RecordObject
from nmigen.utils import log2_int
from nmigen.lib.coding import Decoder
from nmutil.util import Display

#from nmutil.plru import PLRU
from soc.experiment.plru import PLRU, PLRUs
from soc.experiment.cache_ram import CacheRam

from soc.experiment.mem_types import (Fetch1ToICacheType,
                                      ICacheToDecode1Type,
                                      MMUToICacheType)

from soc.experiment.wb_types import (WB_ADDR_BITS, WB_DATA_BITS,
                                     WB_SEL_BITS, WBAddrType, WBDataType,
                                     WBSelType, WBMasterOut, WBSlaveOut,
                                     )

from nmigen_soc.wishbone.bus import Interface

# for test
from soc.bus.sram import SRAM
from nmigen import Memory
from nmutil.util import wrap

# NOTE: to use cxxsim, export NMIGEN_SIM_MODE=cxxsim from the shell
# Also, check out the cxxsim nmigen branch, and latest yosys from git
from nmutil.sim_tmp_alternative import Simulator, Settle


SIM = 0
LINE_SIZE = 64
# BRAM organisation: We never access more than wishbone_data_bits
# at a time so to save resources we make the array only that wide,
# and use consecutive indices to make a cache "line"
#
# ROW_SIZE is the width in bytes of the BRAM (based on WB, so 64-bits)
ROW_SIZE = WB_DATA_BITS // 8
# Number of lines in a set
NUM_LINES = 16
# Number of ways
NUM_WAYS = 4
# L1 ITLB number of entries (direct mapped)
TLB_SIZE = 64
# L1 ITLB log_2(page_size)
TLB_LG_PGSZ = 12
# Number of real address bits that we store
REAL_ADDR_BITS = 56
# Non-zero to enable log data collection
LOG_LENGTH = 0

ROW_SIZE_BITS = ROW_SIZE * 8
# ROW_PER_LINE is the number of rows (wishbone transactions) in a line
ROW_PER_LINE = LINE_SIZE // ROW_SIZE
# BRAM_ROWS is the number of rows in BRAM needed to represent the full icache
BRAM_ROWS = NUM_LINES * ROW_PER_LINE
# INSN_PER_ROW is the number of 32bit instructions per BRAM row
INSN_PER_ROW = ROW_SIZE_BITS // 32

# Bit fields counts in the address
#
# INSN_BITS is the number of bits to select an instruction in a row
INSN_BITS = log2_int(INSN_PER_ROW)
# ROW_BITS is the number of bits to select a row
ROW_BITS = log2_int(BRAM_ROWS)
# ROW_LINE_BITS is the number of bits to select a row within a line
ROW_LINE_BITS = log2_int(ROW_PER_LINE)
# LINE_OFF_BITS is the number of bits for the offset in a cache line
LINE_OFF_BITS = log2_int(LINE_SIZE)
# ROW_OFF_BITS is the number of bits for the offset in a row
ROW_OFF_BITS = log2_int(ROW_SIZE)
# INDEX_BITS is the number of bits to select a cache line
INDEX_BITS = log2_int(NUM_LINES)
# SET_SIZE_BITS is the log base 2 of the set size
SET_SIZE_BITS = LINE_OFF_BITS + INDEX_BITS
# TAG_BITS is the number of bits of the tag part of the address
TAG_BITS = REAL_ADDR_BITS - SET_SIZE_BITS
# TAG_WIDTH is TAG_BITS rounded up to a multiple of 8 (currently unused
# here: tags are packed TAG_BITS wide, see TAG_RAM_WIDTH and read_tag)
TAG_WIDTH = TAG_BITS + 7 - ((TAG_BITS + 7) % 8)

# WAY_BITS is the number of bits to select a way
WAY_BITS = log2_int(NUM_WAYS)
TAG_RAM_WIDTH = TAG_BITS * NUM_WAYS

# L1 ITLB
TLB_BITS = log2_int(TLB_SIZE)
TLB_EA_TAG_BITS = 64 - (TLB_LG_PGSZ + TLB_BITS)
TLB_PTE_BITS = 64

print("BRAM_ROWS =", BRAM_ROWS)
print("INDEX_BITS =", INDEX_BITS)
print("INSN_BITS =", INSN_BITS)
print("INSN_PER_ROW =", INSN_PER_ROW)
print("LINE_SIZE =", LINE_SIZE)
print("LINE_OFF_BITS =", LINE_OFF_BITS)
print("LOG_LENGTH =", LOG_LENGTH)
print("NUM_LINES =", NUM_LINES)
print("NUM_WAYS =", NUM_WAYS)
print("REAL_ADDR_BITS =", REAL_ADDR_BITS)
print("ROW_BITS =", ROW_BITS)
print("ROW_OFF_BITS =", ROW_OFF_BITS)
print("ROW_LINE_BITS =", ROW_LINE_BITS)
print("ROW_PER_LINE =", ROW_PER_LINE)
print("ROW_SIZE =", ROW_SIZE)
print("ROW_SIZE_BITS =", ROW_SIZE_BITS)
print("SET_SIZE_BITS =", SET_SIZE_BITS)
print("SIM =", SIM)
print("TAG_BITS =", TAG_BITS)
print("TAG_RAM_WIDTH =", TAG_RAM_WIDTH)
print("TLB_BITS =", TLB_BITS)
print("TLB_EA_TAG_BITS =", TLB_EA_TAG_BITS)
print("TLB_LG_PGSZ =", TLB_LG_PGSZ)
print("TLB_PTE_BITS =", TLB_PTE_BITS)
print("TLB_SIZE =", TLB_SIZE)
print("WAY_BITS =", WAY_BITS)

# from microwatt/utils.vhdl
def ispow2(n):
    return n != 0 and (n & (n - 1)) == 0

assert LINE_SIZE % ROW_SIZE == 0
assert ispow2(LINE_SIZE), "LINE_SIZE not power of 2"
assert ispow2(NUM_LINES), "NUM_LINES not power of 2"
assert ispow2(ROW_PER_LINE), "ROW_PER_LINE not power of 2"
assert ispow2(INSN_PER_ROW), "INSN_PER_ROW not power of 2"
assert (ROW_BITS == (INDEX_BITS + ROW_LINE_BITS)), \
    "geometry bits don't add up"
assert (LINE_OFF_BITS == (ROW_OFF_BITS + ROW_LINE_BITS)), \
    "geometry bits don't add up"
assert (REAL_ADDR_BITS == (TAG_BITS + INDEX_BITS + LINE_OFF_BITS)), \
    "geometry bits don't add up"
assert (REAL_ADDR_BITS == (TAG_BITS + ROW_BITS + ROW_OFF_BITS)), \
    "geometry bits don't add up"

# Example of layout for 32 lines of 64 bytes:
#
# ..  tag    |index|  line  |
# ..         |   row   |    |
# ..         |     |   | |00| zero          (2)
# ..         |     |   |-|  | INSN_BITS     (1)
# ..         |     |---|    | ROW_LINE_BITS (3)
# ..         |     |--- - --| LINE_OFF_BITS (6)
# ..         |         |- --| ROW_OFF_BITS  (3)
# ..         |----- ---|    | ROW_BITS      (8)
# ..         |-----|        | INDEX_BITS    (5)
# .. --------|              | TAG_BITS      (53)
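
# Worked example (a plain-integer sanity check, illustration only, not part
# of the hardware): with the defaults above (LINE_SIZE=64, ROW_SIZE=8,
# NUM_LINES=16) a 56-bit real address splits, from the LSB up, into a 6-bit
# line offset (whose top 3 bits select the row within the line), a 4-bit
# index and a 46-bit tag.  Reassembling the fields must give back the
# original address.
def _geometry_selfcheck(addr=0x00123456789ABC):
    addr &= (1 << REAL_ADDR_BITS) - 1
    line_off = addr & (LINE_SIZE - 1)                  # byte within the line
    index = (addr >> LINE_OFF_BITS) & (NUM_LINES - 1)  # which line in a set
    tag = addr >> SET_SIZE_BITS                        # everything above
    assert (tag << SET_SIZE_BITS) | (index << LINE_OFF_BITS) | line_off == addr

_geometry_selfcheck()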

# The cache data BRAM organized as described above for each way
#subtype cache_row_t is std_ulogic_vector(ROW_SIZE_BITS-1 downto 0);
#
# The cache tags LUTRAM has a row per set. Vivado is a pain and will
# not handle a clean (commented) definition of the cache tags as a 3d
# memory. For now, work around it by putting all the tags for a set
# into a single row.
def CacheTagArray():
    tag_layout = [('valid', NUM_WAYS),  # one valid bit per way
                  ('tag', TAG_RAM_WIDTH),
                 ]
    return Array(Record(tag_layout, name="tag%d" % x) for x in range(NUM_LINES))

def RowPerLineValidArray():
    return Array(Signal(name="rows_valid_%d" % x)
                 for x in range(ROW_PER_LINE))


# TODO to be passed to nmigen as ram attributes
# attribute ram_style : string;
# attribute ram_style of cache_tags : signal is "distributed";

def TLBArray():
    tlb_layout = [('valid', 1),
                  ('tag', TLB_EA_TAG_BITS),
                  ('pte', TLB_PTE_BITS)
                 ]
    return Array(Record(tlb_layout, name="tlb%d" % x) for x in range(TLB_SIZE))

# Cache RAM interface
def CacheRamOut():
    return Array(Signal(ROW_SIZE_BITS, name="cache_out_%d" % x)
                 for x in range(NUM_WAYS))

# PLRU output interface
def PLRUOut():
    return Array(Signal(WAY_BITS, name="plru_out_%d" % x)
                 for x in range(NUM_LINES))

# Return the cache line index (tag index) for an address
def get_index(addr):
    return addr[LINE_OFF_BITS:SET_SIZE_BITS]

# Return the cache row index (data memory) for an address
def get_row(addr):
    return addr[ROW_OFF_BITS:SET_SIZE_BITS]

# Return the index of a row within a line
def get_row_of_line(row):
    return row[:ROW_BITS][:ROW_LINE_BITS]

# Returns whether the address is in the last row of a line
def is_last_row_addr(addr, last):
    return addr[ROW_OFF_BITS:LINE_OFF_BITS] == last

# Returns whether this is the last row of a line
def is_last_row(row, last):
    return get_row_of_line(row) == last

# Return the next row in the current cache line. We use a dedicated
# function in order to limit the size of the generated adder to be
# only the bits within a cache line (3 bits with default settings)
def next_row(row):
    row_v = row[0:ROW_LINE_BITS] + 1
    return Cat(row_v[:ROW_LINE_BITS], row[ROW_LINE_BITS:])

# Read the instruction word for the given address
# in the current cache row
def read_insn_word(addr, data):
    word = addr[2:INSN_BITS+2]
    return data.word_select(word, 32)

# Get the tag value from the address
def get_tag(addr):
    return addr[SET_SIZE_BITS:REAL_ADDR_BITS]

# Read a tag from a tag memory row (tags are packed TAG_BITS wide:
# TAG_RAM_WIDTH = TAG_BITS * NUM_WAYS)
def read_tag(way, tagset):
    return tagset.word_select(way, TAG_BITS)

# Write a tag to tag memory row
def write_tag(way, tagset, tag):
    return read_tag(way, tagset).eq(tag)

# Simple hash for direct-mapped TLB index
def hash_ea(addr):
    hsh = (addr[TLB_LG_PGSZ:TLB_LG_PGSZ + TLB_BITS] ^
           addr[TLB_LG_PGSZ + TLB_BITS:TLB_LG_PGSZ + 2 * TLB_BITS] ^
           addr[TLB_LG_PGSZ + 2 * TLB_BITS:TLB_LG_PGSZ + 3 * TLB_BITS])
    return hsh
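
# A plain-integer mirror of the Signal-slicing helpers above, handy for
# interactive exploration (illustration only, not used by the hardware).
# Note how hash_ea XOR-folds three TLB_BITS-wide slices of the effective
# address above the page offset into the direct-mapped iTLB index.
def _model_decode(addr):
    index = (addr >> LINE_OFF_BITS) & (NUM_LINES - 1)      # get_index
    row = (addr >> ROW_OFF_BITS) & (BRAM_ROWS - 1)         # get_row
    tag = (addr >> SET_SIZE_BITS) & ((1 << TAG_BITS) - 1)  # get_tag
    def fold(i):
        return (addr >> (TLB_LG_PGSZ + i * TLB_BITS)) & (TLB_SIZE - 1)
    tlb_index = fold(0) ^ fold(1) ^ fold(2)                # hash_ea
    return index, row, tag, tlb_index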


# Cache reload state machine
@unique
class State(Enum):
    IDLE = 0
    CLR_TAG = 1
    WAIT_ACK = 2


class RegInternal(RecordObject):
    def __init__(self):
        super().__init__()
        # Cache hit state (Latches for 1 cycle BRAM access)
        self.hit_way = Signal(WAY_BITS)
        self.hit_nia = Signal(64)
        self.hit_smark = Signal()
        self.hit_valid = Signal()

        # Cache miss state (reload state machine)
        self.state = Signal(State, reset=State.IDLE)
        self.wb = WBMasterOut("wb")
        self.req_adr = Signal(64)
        self.store_way = Signal(WAY_BITS)
        self.store_index = Signal(INDEX_BITS)
        self.store_row = Signal(ROW_BITS)
        self.store_tag = Signal(TAG_BITS)
        self.store_valid = Signal()
        self.end_row_ix = Signal(ROW_LINE_BITS)
        self.rows_valid = RowPerLineValidArray()

        # TLB miss state
        self.fetch_failed = Signal()


class ICache(Elaboratable):
    """64 bit set associative icache. All instructions are 4B aligned."""
    def __init__(self):
        self.i_in = Fetch1ToICacheType(name="i_in")
        self.i_out = ICacheToDecode1Type(name="i_out")

        self.m_in = MMUToICacheType(name="m_in")

        self.stall_in = Signal()
        self.stall_out = Signal()
        self.flush_in = Signal()
        self.inval_in = Signal()

        # standard naming (wired to non-standard for compatibility)
        self.bus = Interface(addr_width=32,
                             data_width=64,
                             granularity=8,
                             features={'stall'},
                             alignment=0,
                             name="icache")

        self.log_out = Signal(54)


    # Generate a cache RAM for each way
    def rams(self, m, r, cache_out_row, use_previous,
             replace_way, req_row):

        comb = m.d.comb
        sync = m.d.sync

        bus, stall_in = self.bus, self.stall_in

        # read condition (for every cache ram)
        do_read = Signal()
        comb += do_read.eq(~(stall_in | use_previous))

        rd_addr = Signal(ROW_BITS)
        wr_addr = Signal(ROW_BITS)
        comb += rd_addr.eq(req_row)
        comb += wr_addr.eq(r.store_row)

        # binary-to-unary converters: replace-way enabled by bus.ack,
        # hit-way left permanently enabled
        m.submodules.replace_way_e = re = Decoder(NUM_WAYS)
        m.submodules.hit_way_e = he = Decoder(NUM_WAYS)
        comb += re.i.eq(replace_way)
        comb += re.n.eq(~bus.ack)
        comb += he.i.eq(r.hit_way)
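        # e.g. with replace_way == 2 and bus.ack high, re.o == 0b0100 and
        # only way 2's write-enable fires; while bus.ack is low, re.n
        # forces re.o to all-zeros so no way is written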

        for i in range(NUM_WAYS):
            do_write = Signal(name="do_wr_%d" % i)
            d_out = Signal(ROW_SIZE_BITS, name="d_out_%d" % i)
            wr_sel = Signal(ROW_SIZE, name="wr_sel_%d" % i)

            way = CacheRam(ROW_BITS, ROW_SIZE_BITS, TRACE=True, ram_num=i)
            m.submodules["cacheram_%d" % i] = way

            comb += way.rd_en.eq(do_read)
            comb += way.rd_addr.eq(rd_addr)
            comb += d_out.eq(way.rd_data_o)
            comb += way.wr_sel.eq(wr_sel)
            comb += way.wr_addr.eq(wr_addr)
            comb += way.wr_data.eq(bus.dat_r)

            comb += do_write.eq(re.o[i])

            with m.If(do_write):
                sync += Display("cache write adr: %x data: %lx",
                                wr_addr, way.wr_data)

            with m.If(he.o[i]):
                comb += cache_out_row.eq(d_out)
                with m.If(do_read):
                    sync += Display("cache read adr: %x data: %x",
                                    req_row, d_out)

            comb += wr_sel.eq(Repl(do_write, ROW_SIZE))
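            # Repl fans the single write-enable bit out across all ROW_SIZE
            # byte lanes, so a reload always writes the full 64-bit row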

    # Generate PLRUs
    def maybe_plrus(self, m, r, plru_victim):
        comb = m.d.comb

        # no PLRU needed with a single way (direct-mapped)
        if NUM_WAYS <= 1:
            return

        m.submodules.plrus = plru = PLRUs(NUM_LINES, WAY_BITS, plru_victim)
        comb += plru.way.eq(r.hit_way)
        comb += plru.valid.eq(r.hit_valid)
        comb += plru.index.eq(get_index(r.hit_nia))

    # TLB hit detection and real address generation
    def itlb_lookup(self, m, tlb_req_index, itlb,
                    real_addr, ra_valid, eaa_priv,
                    priv_fault, access_ok):

        comb = m.d.comb

        i_in = self.i_in

        pte = Signal(TLB_PTE_BITS)
        ttag = Signal(TLB_EA_TAG_BITS)

        comb += tlb_req_index.eq(hash_ea(i_in.nia))
        comb += pte.eq(itlb[tlb_req_index].pte)
        comb += ttag.eq(itlb[tlb_req_index].tag)

        with m.If(i_in.virt_mode):
            comb += real_addr.eq(Cat(
                     i_in.nia[:TLB_LG_PGSZ],
                     pte[TLB_LG_PGSZ:REAL_ADDR_BITS]
                    ))
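            # e.g. with 4K pages (TLB_LG_PGSZ == 12) the translated address
            # keeps the 12-bit page offset from i_in.nia and takes the rest
            # from the PTE's real page number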

            with m.If(ttag == i_in.nia[TLB_LG_PGSZ + TLB_BITS:64]):
                comb += ra_valid.eq(itlb[tlb_req_index].valid)

            comb += eaa_priv.eq(pte[3])

        with m.Else():
            comb += real_addr.eq(i_in.nia[:REAL_ADDR_BITS])
            comb += ra_valid.eq(1)
            comb += eaa_priv.eq(1)

        # No IAMR, so no KUEP support for now
        comb += priv_fault.eq(eaa_priv & ~i_in.priv_mode)
        comb += access_ok.eq(ra_valid & ~priv_fault)

    # iTLB update
    def itlb_update(self, m, itlb):
        comb = m.d.comb
        sync = m.d.sync

        m_in = self.m_in

        wr_index = Signal(TLB_BITS)  # direct-mapped index, log2(TLB_SIZE) wide
        comb += wr_index.eq(hash_ea(m_in.addr))

        with m.If(m_in.tlbie & m_in.doall):
            # Clear all valid bits
            for i in range(TLB_SIZE):
                sync += itlb[i].valid.eq(0)

        with m.Elif(m_in.tlbie):
            # Clear entry regardless of hit or miss
            sync += itlb[wr_index].valid.eq(0)

        with m.Elif(m_in.tlbld):
            sync += itlb[wr_index].tag.eq(m_in.addr[TLB_LG_PGSZ + TLB_BITS:64])
            sync += itlb[wr_index].pte.eq(m_in.pte)
            sync += itlb[wr_index].valid.eq(1)

    # Cache hit detection, output to fetch2 and other misc logic
    def icache_comb(self, m, use_previous, r, req_index, req_row,
                    req_hit_way, req_tag, real_addr, req_laddr,
                    cache_tags, access_ok,
                    req_is_hit, req_is_miss, replace_way,
                    plru_victim, cache_out_row):

        comb = m.d.comb

        i_in, i_out, bus = self.i_in, self.i_out, self.bus
        flush_in, stall_out = self.flush_in, self.stall_out

        is_hit = Signal()
        hit_way = Signal(WAY_BITS)

        # i_in.sequential means that i_in.nia this cycle is 4 more than
        # last cycle. If we read more than 32 bits at a time, had a
        # cache hit last cycle, and we don't want the first 32-bit chunk
        # then we can keep the data we read last cycle and just use that.
        with m.If(i_in.nia[2:INSN_BITS+2] != 0):
            comb += use_previous.eq(i_in.sequential & r.hit_valid)

        # Extract line, row and tag from request
        comb += req_index.eq(get_index(i_in.nia))
        comb += req_row.eq(get_row(i_in.nia))
        comb += req_tag.eq(get_tag(real_addr))

        # Calculate address of beginning of cache row, will be
        # used for cache miss processing if needed
        comb += req_laddr.eq(Cat(
                 Const(0, ROW_OFF_BITS),
                 real_addr[ROW_OFF_BITS:REAL_ADDR_BITS],
                ))

        # Test if pending request is a hit on any way
        hitcond = Signal()
        comb += hitcond.eq((r.state == State.WAIT_ACK)
                           & (req_index == r.store_index)
                           & r.rows_valid[req_row % ROW_PER_LINE]
                          )
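        # while a reload is in flight (WAIT_ACK), rows that have already
        # been fetched from the bus can hit: rows_valid is set per row as
        # each ack arrives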
        # i_in.req asserts Decoder active
        cvb = Signal(NUM_WAYS)
        ctag = Signal(TAG_RAM_WIDTH)
        comb += ctag.eq(cache_tags[req_index].tag)
        comb += cvb.eq(cache_tags[req_index].valid)
        m.submodules.store_way_e = se = Decoder(NUM_WAYS)
        comb += se.i.eq(r.store_way)
        comb += se.n.eq(~i_in.req)
        for i in range(NUM_WAYS):
            tagi = Signal(TAG_BITS, name="tag_i%d" % i)
            hit_test = Signal(name="hit_test%d" % i)
            is_tag_hit = Signal(name="is_tag_hit_%d" % i)
            comb += tagi.eq(read_tag(i, ctag))
            comb += hit_test.eq(se.o[i])
            comb += is_tag_hit.eq((cvb[i] | (hitcond & hit_test)) &
                                  (tagi == req_tag))
            with m.If(is_tag_hit):
                comb += hit_way.eq(i)
                comb += is_hit.eq(1)

        # Generate the "hit" and "miss" signals
        # for the synchronous blocks
        with m.If(i_in.req & access_ok & ~flush_in):
            comb += req_is_hit.eq(is_hit)
            comb += req_is_miss.eq(~is_hit)

        comb += req_hit_way.eq(hit_way)

        # The way to replace on a miss
        with m.If(r.state == State.CLR_TAG):
            comb += replace_way.eq(plru_victim[r.store_index])
        with m.Else():
            comb += replace_way.eq(r.store_way)

        # Output instruction from current cache row
        #
        # Note: This is a mild violation of our design principle of
        # having pipeline stages output from a clean latch. In this
        # case we output the result of a mux. The alternative would
        # be to output an entire row, which I prefer not to do just yet
        # as it would force fetch2 to know about some of the cache
        # geometry information.
        comb += i_out.insn.eq(read_insn_word(r.hit_nia, cache_out_row))
        comb += i_out.valid.eq(r.hit_valid)
        comb += i_out.nia.eq(r.hit_nia)
        comb += i_out.stop_mark.eq(r.hit_smark)
        comb += i_out.fetch_failed.eq(r.fetch_failed)

        # Stall fetch1 if we have a miss on cache or TLB
        # or a protection fault
        comb += stall_out.eq(~(is_hit & access_ok))
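        # stall_out is combinatorial, so fetch1 sees the stall in the same
        # cycle the miss (or TLB/protection failure) is detected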

        # Wishbone requests output (from the cache miss reload machine)
        comb += bus.we.eq(r.wb.we)
        comb += bus.adr.eq(r.wb.adr)
        comb += bus.sel.eq(r.wb.sel)
        comb += bus.stb.eq(r.wb.stb)
        comb += bus.dat_w.eq(r.wb.dat)
        comb += bus.cyc.eq(r.wb.cyc)

    # Cache hit synchronous machine
    def icache_hit(self, m, use_previous, r, req_is_hit, req_hit_way,
                   req_index, req_tag, real_addr):
        sync = m.d.sync

        i_in, stall_in = self.i_in, self.stall_in
        flush_in = self.flush_in

        # keep outputs to fetch2 unchanged on a stall
        # except that flush or reset sets valid to 0
        # If use_previous, keep the same data as last
        # cycle and use the second half
        with m.If(stall_in | use_previous):
            with m.If(flush_in):
                sync += r.hit_valid.eq(0)
        with m.Else():
            # On a hit, latch the request for the next cycle,
            # when the BRAM data will be available on the
            # cache_out output of the corresponding way
            sync += r.hit_valid.eq(req_is_hit)

            with m.If(req_is_hit):
                sync += r.hit_way.eq(req_hit_way)
                sync += Display("cache hit nia:%x IR:%x SM:%x idx:%x tag:%x "
                                "way:%x RA:%x", i_in.nia, i_in.virt_mode,
                                i_in.stop_mark, req_index, req_tag,
                                req_hit_way, real_addr)

        with m.If(~stall_in):
            # Send stop marks and NIA down regardless of validity
            sync += r.hit_smark.eq(i_in.stop_mark)
            sync += r.hit_nia.eq(i_in.nia)

    def icache_miss_idle(self, m, r, req_is_miss, req_laddr,
                         req_index, req_tag, replace_way, real_addr):
        comb = m.d.comb
        sync = m.d.sync

        i_in = self.i_in

        # Reset per-row valid flags, only used in WAIT_ACK
        for i in range(ROW_PER_LINE):
            sync += r.rows_valid[i].eq(0)

        # We need to read a cache line
        with m.If(req_is_miss):
            sync += Display(
                     "cache miss nia:%x IR:%x SM:%x idx:%x "
                     "way:%x tag:%x RA:%x", i_in.nia,
                     i_in.virt_mode, i_in.stop_mark, req_index,
                     replace_way, req_tag, real_addr)

            # Keep track of our index and way for subsequent stores
            st_row = Signal(ROW_BITS)
            comb += st_row.eq(get_row(req_laddr))
            sync += r.store_index.eq(req_index)
            sync += r.store_row.eq(st_row)
            sync += r.store_tag.eq(req_tag)
            sync += r.store_valid.eq(1)
            sync += r.end_row_ix.eq(get_row_of_line(st_row) - 1)

            # Prep for first wishbone read.  We calculate the address
            # of the start of the cache line and start the WB cycle.
            sync += r.req_adr.eq(req_laddr)
            sync += r.wb.cyc.eq(1)
            sync += r.wb.stb.eq(1)

            # Track that we had one request sent
            sync += r.state.eq(State.CLR_TAG)

    def icache_miss_clr_tag(self, m, r, replace_way,
                            req_index,
                            tagset, cache_tags):
        comb = m.d.comb
        sync = m.d.sync

        # Get victim way from plru
        sync += r.store_way.eq(replace_way)

        # Force misses on that way while reloading that line
        cv = Signal(NUM_WAYS)  # per-way valid bits for this line
        comb += cv.eq(cache_tags[req_index].valid)
        comb += cv.bit_select(replace_way, 1).eq(0)
        sync += cache_tags[req_index].valid.eq(cv)

        for i in range(NUM_WAYS):
            with m.If(i == replace_way):
                comb += tagset.eq(cache_tags[r.store_index].tag)
                comb += write_tag(i, tagset, r.store_tag)
                sync += cache_tags[r.store_index].tag.eq(tagset)

        sync += r.state.eq(State.WAIT_ACK)

    def icache_miss_wait_ack(self, m, r, replace_way, inval_in,
                             cache_tags, stbs_done):
        comb = m.d.comb
        sync = m.d.sync

        bus = self.bus

        # Requests are all sent if stb is 0
        stbs_zero = Signal()
        comb += stbs_zero.eq(r.wb.stb == 0)
        comb += stbs_done.eq(stbs_zero)

        # If we are still sending requests, was one accepted?
        with m.If(~bus.stall & ~stbs_zero):
            # That was the last word?  We are done sending.
            # Clear stb and set stbs_done so we can handle
            # an eventual last ack on the same cycle.
            with m.If(is_last_row_addr(r.req_adr, r.end_row_ix)):
                sync += Display("IS_LAST_ROW_ADDR r.wb.addr:%x "
                                "r.end_row_ix:%x r.wb.stb:%x stbs_zero:%x "
                                "stbs_done:%x", r.wb.adr, r.end_row_ix,
                                r.wb.stb, stbs_zero, stbs_done)
                sync += r.wb.stb.eq(0)
                comb += stbs_done.eq(1)

            # Calculate the next row address
            rarange = Signal(LINE_OFF_BITS - ROW_OFF_BITS)
            comb += rarange.eq(r.req_adr[ROW_OFF_BITS:LINE_OFF_BITS] + 1)
            sync += r.req_adr[ROW_OFF_BITS:LINE_OFF_BITS].eq(rarange)
            sync += Display("RARANGE r.req_adr:%x rarange:%x "
                            "stbs_zero:%x stbs_done:%x",
                            r.req_adr, rarange, stbs_zero, stbs_done)

        # Incoming acks processing
        with m.If(bus.ack):
            sync += Display("WB_IN_ACK data:%x stbs_zero:%x "
                            "stbs_done:%x",
                            bus.dat_r, stbs_zero, stbs_done)

            sync += r.rows_valid[r.store_row % ROW_PER_LINE].eq(1)

            # Check for completion
            with m.If(stbs_done & is_last_row(r.store_row, r.end_row_ix)):
                # Complete wishbone cycle
                sync += r.wb.cyc.eq(0)
                # be nice, clear addr
                sync += r.req_adr.eq(0)

                # Cache line is now valid
                cv = Signal(NUM_WAYS)  # per-way valid bits for this line
                comb += cv.eq(cache_tags[r.store_index].valid)
                comb += cv.bit_select(replace_way, 1).eq(
                         r.store_valid & ~inval_in)
                sync += cache_tags[r.store_index].valid.eq(cv)

                sync += r.state.eq(State.IDLE)

            # move on to next request in row
            # Increment store row counter
            sync += r.store_row.eq(next_row(r.store_row))

    # Cache miss/reload synchronous machine
    def icache_miss(self, m, r, req_is_miss,
                    req_index, req_laddr, req_tag, replace_way,
                    cache_tags, access_ok, real_addr):
        comb = m.d.comb
        sync = m.d.sync

        i_in, bus, m_in = self.i_in, self.bus, self.m_in
        stall_in, flush_in = self.stall_in, self.flush_in
        inval_in = self.inval_in

        tagset = Signal(TAG_RAM_WIDTH)
        stbs_done = Signal()

        comb += r.wb.sel.eq(-1)
        comb += r.wb.adr.eq(r.req_adr[3:])
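        # sel is all-ones (whole 64-bit rows are always fetched) and the
        # bus address is in units of the 64-bit data width, hence dropping
        # the low 3 byte-address bits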

        # Process cache invalidations
        with m.If(inval_in):
            for i in range(NUM_LINES):
                sync += cache_tags[i].valid.eq(0)
            sync += r.store_valid.eq(0)

        # Main state machine
        with m.Switch(r.state):

            with m.Case(State.IDLE):
                self.icache_miss_idle(m, r, req_is_miss, req_laddr,
                                      req_index, req_tag, replace_way,
                                      real_addr)

            with m.Case(State.CLR_TAG, State.WAIT_ACK):
                with m.If(r.state == State.CLR_TAG):
                    self.icache_miss_clr_tag(m, r, replace_way,
                                             req_index, tagset, cache_tags)

                self.icache_miss_wait_ack(m, r, replace_way, inval_in,
                                          cache_tags, stbs_done)

        # TLB miss and protection fault processing
        with m.If(flush_in | m_in.tlbld):
            sync += r.fetch_failed.eq(0)
        with m.Elif(i_in.req & ~access_ok & ~stall_in):
            sync += r.fetch_failed.eq(1)

    # icache_log: if LOG_LENGTH > 0 generate
    def icache_log(self, m, req_hit_way, ra_valid, access_ok,
                   req_is_miss, req_is_hit, lway, wstate, r):
        comb = m.d.comb
        sync = m.d.sync

        bus, i_out = self.bus, self.i_out
        log_out, stall_out = self.log_out, self.stall_out

        # Output data to logger
        for i in range(LOG_LENGTH):
            log_data = Signal(54)
            lway = Signal(WAY_BITS)
            wstate = Signal()

            sync += lway.eq(req_hit_way)
            sync += wstate.eq(0)

            with m.If(r.state != State.IDLE):
                sync += wstate.eq(1)

            sync += log_data.eq(Cat(
                     ra_valid, access_ok, req_is_miss, req_is_hit,
                     lway, wstate, r.hit_nia[2:6], r.fetch_failed,
                     stall_out, bus.stall, r.wb.cyc, r.wb.stb,
                     r.real_addr[3:6], bus.ack, i_out.insn, i_out.valid
                    ))
            comb += log_out.eq(log_data)

    def elaborate(self, platform):

        m = Module()
        comb = m.d.comb

        # Storage. Hopefully "cache_rows" is a BRAM, the rest is LUTs
        cache_tags = CacheTagArray()

        # TLB Array
        itlb = TLBArray()

        # TODO to be passed to nmigen as ram attributes
        # attribute ram_style of itlb_tags : signal is "distributed";
        # attribute ram_style of itlb_ptes : signal is "distributed";

        # Privilege bit from PTE EAA field
        eaa_priv = Signal()

        r = RegInternal()

        # Async signal on incoming request
        req_index = Signal(INDEX_BITS)
        req_row = Signal(ROW_BITS)
        req_hit_way = Signal(WAY_BITS)
        req_tag = Signal(TAG_BITS)
        req_is_hit = Signal()
        req_is_miss = Signal()
        req_laddr = Signal(64)

        tlb_req_index = Signal(TLB_BITS)  # direct-mapped index, log2(TLB_SIZE)
        real_addr = Signal(REAL_ADDR_BITS)
        ra_valid = Signal()
        priv_fault = Signal()
        access_ok = Signal()
        use_previous = Signal()

        cache_out_row = Signal(ROW_SIZE_BITS)

        plru_victim = PLRUOut()
        replace_way = Signal(WAY_BITS)

        # fake-up the wishbone stall signal to comply with pipeline mode
        # same thing is done in dcache.py
        comb += self.bus.stall.eq(self.bus.cyc & ~self.bus.ack)
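        # (a classic, non-pipelined slave acks one request at a time, so
        # "busy whenever a cycle is in progress and not yet acked" makes
        # it behave like a stall-capable pipelined slave)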

        # call sub-functions putting everything together,
        # using shared signals established above
        self.rams(m, r, cache_out_row, use_previous, replace_way, req_row)
        self.maybe_plrus(m, r, plru_victim)
        self.itlb_lookup(m, tlb_req_index, itlb, real_addr,
                         ra_valid, eaa_priv, priv_fault,
                         access_ok)
        self.itlb_update(m, itlb)
        self.icache_comb(m, use_previous, r, req_index, req_row, req_hit_way,
                         req_tag, real_addr, req_laddr,
                         cache_tags, access_ok, req_is_hit, req_is_miss,
                         replace_way, plru_victim, cache_out_row)
        self.icache_hit(m, use_previous, r, req_is_hit, req_hit_way,
                        req_index, req_tag, real_addr)
        self.icache_miss(m, r, req_is_miss, req_index,
                         req_laddr, req_tag, replace_way, cache_tags,
                         access_ok, real_addr)
        #self.icache_log(m, log_out, req_hit_way, ra_valid, access_ok,
        #                req_is_miss, req_is_hit, lway, wstate, r)

        return m


def icache_sim(dut):
    i_in = dut.i_in
    i_out = dut.i_out
    m_out = dut.m_in

    yield i_in.priv_mode.eq(1)
    yield i_in.req.eq(0)
    yield i_in.nia.eq(0)
    yield i_in.stop_mark.eq(0)
    yield m_out.tlbld.eq(0)
    yield m_out.tlbie.eq(0)
    yield m_out.addr.eq(0)
    yield m_out.pte.eq(0)
    yield
    yield
    yield
    yield

    # miss, stalls for a bit
    yield i_in.req.eq(1)
    yield i_in.nia.eq(Const(0x0000000000000004, 64))
    yield
    valid = yield i_out.valid
    while not valid:
        yield
        valid = yield i_out.valid
    yield i_in.req.eq(0)

    insn = yield i_out.insn
    nia = yield i_out.nia
    assert insn == 0x00000001, \
        "insn @%x=%x expected 00000001" % (nia, insn)
    yield i_in.req.eq(0)
    yield

    # hit
    yield i_in.req.eq(1)
    yield i_in.nia.eq(Const(0x0000000000000008, 64))
    yield
    valid = yield i_out.valid
    while not valid:
        yield
        valid = yield i_out.valid
    yield i_in.req.eq(0)

    nia = yield i_out.nia
    insn = yield i_out.insn
    yield
    assert insn == 0x00000002, \
        "insn @%x=%x expected 00000002" % (nia, insn)

    # another miss
    yield i_in.req.eq(1)
    yield i_in.nia.eq(Const(0x0000000000000040, 64))
    yield
    valid = yield i_out.valid
    while not valid:
        yield
        valid = yield i_out.valid
    yield i_in.req.eq(0)

    nia = yield i_in.nia
    insn = yield i_out.insn
    assert insn == 0x00000010, \
        "insn @%x=%x expected 00000010" % (nia, insn)

    # test something that aliases (this only works because
    # the unit test SRAM is a depth of 512)
    yield i_in.req.eq(1)
    yield i_in.nia.eq(Const(0x0000000000000100, 64))
    yield
    yield
    valid = yield i_out.valid
    assert not valid
    for i in range(30):
        yield
    yield
    valid = yield i_out.valid
    insn = yield i_out.insn
    assert valid
    assert insn == 0x00000040, \
        "insn @%x=%x expected 00000040" % (nia, insn)
    yield i_in.req.eq(0)


def test_icache(mem):
    dut = ICache()

    memory = Memory(width=64, depth=512, init=mem)
    sram = SRAM(memory=memory, granularity=8)

    m = Module()

    m.submodules.icache = dut
    m.submodules.sram = sram

    m.d.comb += sram.bus.cyc.eq(dut.bus.cyc)
    m.d.comb += sram.bus.stb.eq(dut.bus.stb)
    m.d.comb += sram.bus.we.eq(dut.bus.we)
    m.d.comb += sram.bus.sel.eq(dut.bus.sel)
    m.d.comb += sram.bus.adr.eq(dut.bus.adr)
    m.d.comb += sram.bus.dat_w.eq(dut.bus.dat_w)

    m.d.comb += dut.bus.ack.eq(sram.bus.ack)
    m.d.comb += dut.bus.dat_r.eq(sram.bus.dat_r)

    # nmigen Simulation
    sim = Simulator(m)
    sim.add_clock(1e-6)

    sim.add_sync_process(wrap(icache_sim(dut)))
    with sim.write_vcd('test_icache.vcd'):
        sim.run()


if __name__ == '__main__':
    dut = ICache()
    vl = rtlil.convert(dut, ports=[])
    with open("test_icache.il", "w") as f:
        f.write(vl)

    # set up memory every 32-bits with incrementing values 0 1 2 ...
    mem = []
    for i in range(512):
        mem.append((i*2) | ((i*2+1)<<32))
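    # each 64-bit row packs two consecutive 32-bit "instructions": row i
    # holds 2*i in the low word and 2*i+1 in the high word, so the value
    # fetched at byte address A is simply A >> 2 (as the asserts in
    # icache_sim expect)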

    test_icache(mem)