1 """ICache
2
3 based on Anton Blanchard microwatt icache.vhdl
4
5 Set associative icache
6
7 TODO (in no specific order):
8 * Add debug interface to inspect cache content
9 * Add snoop/invalidate path
10 * Add multi-hit error detection
11 * Pipelined bus interface (wb or axi)
12 * Maybe add parity? There's a few bits free in each BRAM row on Xilinx
13 * Add optimization: service hits on partially loaded lines
14 * Add optimization: (maybe) interrupt reload on fluch/redirect
15 * Check if playing with the geometry of the cache tags allow for more
16 efficient use of distributed RAM and less logic/muxes. Currently we
17 write TAG_BITS width which may not match full ram blocks and might
18 cause muxes to be inferred for "partial writes".
19 * Check if making the read size of PLRU a ROM helps utilization
20 """
21
from enum import (Enum, unique)
from nmigen import (Module, Signal, Elaboratable, Cat, Array, Const, Repl,
                    Record)
from nmigen.cli import main, rtlil
from nmutil.iocontrol import RecordObject
from nmigen.utils import log2_int
from nmigen.lib.coding import Decoder
from nmutil.util import Display

#from nmutil.plru import PLRU
from soc.experiment.cache_ram import CacheRam
from soc.experiment.plru import PLRU

from soc.experiment.mem_types import (Fetch1ToICacheType,
                                      ICacheToDecode1Type,
                                      MMUToICacheType)

from soc.experiment.wb_types import (WB_ADDR_BITS, WB_DATA_BITS,
                                     WB_SEL_BITS, WBAddrType, WBDataType,
                                     WBSelType, WBMasterOut, WBSlaveOut,
                                     )

from nmigen_soc.wishbone.bus import Interface

# for test
from soc.bus.sram import SRAM
from nmigen import Memory
from nmutil.util import wrap

# NOTE: to use cxxsim, export NMIGEN_SIM_MODE=cxxsim from the shell
# Also, check out the cxxsim nmigen branch, and latest yosys from git
from nmutil.sim_tmp_alternative import Simulator, Settle


SIM = 0
LINE_SIZE = 64
# BRAM organisation: We never access more than wishbone_data_bits
# at a time so to save resources we make the array only that wide,
# and use consecutive indices to make a cache "line"
#
# ROW_SIZE is the width in bytes of the BRAM (based on WB, so 64-bits)
ROW_SIZE = WB_DATA_BITS // 8
# Number of lines in a set
NUM_LINES = 16
# Number of ways
NUM_WAYS = 4
# L1 ITLB number of entries (direct mapped)
TLB_SIZE = 64
# L1 ITLB log_2(page_size)
TLB_LG_PGSZ = 12
# Number of real address bits that we store
REAL_ADDR_BITS = 56
# Non-zero to enable log data collection
LOG_LENGTH = 0

ROW_SIZE_BITS = ROW_SIZE * 8
# ROW_PER_LINE is the number of row (wishbone) transactions in a line
ROW_PER_LINE = LINE_SIZE // ROW_SIZE
# BRAM_ROWS is the number of rows in BRAM needed to represent the full icache
BRAM_ROWS = NUM_LINES * ROW_PER_LINE
# INSN_PER_ROW is the number of 32bit instructions per BRAM row
INSN_PER_ROW = ROW_SIZE_BITS // 32

# Bit fields counts in the address
#
# INSN_BITS is the number of bits to select an instruction in a row
INSN_BITS = log2_int(INSN_PER_ROW)
# ROW_BITS is the number of bits to select a row
ROW_BITS = log2_int(BRAM_ROWS)
# ROW_LINE_BITS is the number of bits to select a row within a line
ROW_LINE_BITS = log2_int(ROW_PER_LINE)
# LINE_OFF_BITS is the number of bits for the offset in a cache line
LINE_OFF_BITS = log2_int(LINE_SIZE)
# ROW_OFF_BITS is the number of bits for the offset in a row
ROW_OFF_BITS = log2_int(ROW_SIZE)
# INDEX_BITS is the number of bits to select a cache line
INDEX_BITS = log2_int(NUM_LINES)
# SET_SIZE_BITS is the log base 2 of the set size
SET_SIZE_BITS = LINE_OFF_BITS + INDEX_BITS
# TAG_BITS is the number of bits of the tag part of the address
TAG_BITS = REAL_ADDR_BITS - SET_SIZE_BITS
# TAG_WIDTH is the width in bits of each way of the tag RAM,
# i.e. TAG_BITS rounded up to the next byte boundary
TAG_WIDTH = TAG_BITS + 7 - ((TAG_BITS + 7) % 8)

# WAY_BITS is the number of bits to select a way
WAY_BITS = log2_int(NUM_WAYS)
# each tag RAM row holds one byte-aligned (TAG_WIDTH-wide) tag per way,
# matching the TAG_WIDTH stride used by read_tag/write_tag below
TAG_RAM_WIDTH = TAG_WIDTH * NUM_WAYS

# L1 ITLB
TLB_BITS = log2_int(TLB_SIZE)
TLB_EA_TAG_BITS = 64 - (TLB_LG_PGSZ + TLB_BITS)
TLB_PTE_BITS = 64

print("BRAM_ROWS =", BRAM_ROWS)
print("INDEX_BITS =", INDEX_BITS)
print("INSN_BITS =", INSN_BITS)
print("INSN_PER_ROW =", INSN_PER_ROW)
print("LINE_SIZE =", LINE_SIZE)
print("LINE_OFF_BITS =", LINE_OFF_BITS)
print("LOG_LENGTH =", LOG_LENGTH)
print("NUM_LINES =", NUM_LINES)
print("NUM_WAYS =", NUM_WAYS)
print("REAL_ADDR_BITS =", REAL_ADDR_BITS)
print("ROW_BITS =", ROW_BITS)
print("ROW_OFF_BITS =", ROW_OFF_BITS)
print("ROW_LINE_BITS =", ROW_LINE_BITS)
print("ROW_PER_LINE =", ROW_PER_LINE)
print("ROW_SIZE =", ROW_SIZE)
print("ROW_SIZE_BITS =", ROW_SIZE_BITS)
print("SET_SIZE_BITS =", SET_SIZE_BITS)
print("SIM =", SIM)
print("TAG_BITS =", TAG_BITS)
print("TAG_RAM_WIDTH =", TAG_RAM_WIDTH)
print("TAG_WIDTH =", TAG_WIDTH)
print("TLB_BITS =", TLB_BITS)
print("TLB_EA_TAG_BITS =", TLB_EA_TAG_BITS)
print("TLB_LG_PGSZ =", TLB_LG_PGSZ)
print("TLB_PTE_BITS =", TLB_PTE_BITS)
print("TLB_SIZE =", TLB_SIZE)
print("WAY_BITS =", WAY_BITS)

# from microwatt/utils.vhdl
def ispow2(n):
    return n != 0 and (n & (n - 1)) == 0

assert LINE_SIZE % ROW_SIZE == 0
assert ispow2(LINE_SIZE), "LINE_SIZE not power of 2"
assert ispow2(NUM_LINES), "NUM_LINES not power of 2"
assert ispow2(ROW_PER_LINE), "ROW_PER_LINE not power of 2"
assert ispow2(INSN_PER_ROW), "INSN_PER_ROW not power of 2"
assert (ROW_BITS == (INDEX_BITS + ROW_LINE_BITS)), \
    "geometry bits don't add up"
assert (LINE_OFF_BITS == (ROW_OFF_BITS + ROW_LINE_BITS)), \
    "geometry bits don't add up"
assert (REAL_ADDR_BITS == (TAG_BITS + INDEX_BITS + LINE_OFF_BITS)), \
    "geometry bits don't add up"
assert (REAL_ADDR_BITS == (TAG_BITS + ROW_BITS + ROW_OFF_BITS)), \
    "geometry bits don't add up"

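# Worked example of the geometry computed above, with the default
# constants (LINE_SIZE=64, NUM_LINES=16, NUM_WAYS=4, 64-bit wishbone):
#   ROW_SIZE=8, ROW_PER_LINE=8, BRAM_ROWS=128, INSN_PER_ROW=2
#   INSN_BITS=1, ROW_LINE_BITS=3, ROW_OFF_BITS=3, LINE_OFF_BITS=6
#   INDEX_BITS=4, ROW_BITS=7, SET_SIZE_BITS=10
#   TAG_BITS=46, TAG_WIDTH=48 (rounded up to a byte multiple), WAY_BITS=2
# i.e. a 56-bit real address splits as:
#   | 46-bit tag | 4-bit index | 6-bit line offset |
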
# Example of layout for 32 lines of 64 bytes:
#
# ..  tag    |index|  line  |
# ..         |   row   |    |
# ..         |     |   | |00| zero          (2)
# ..         |     |   |-|  | INSN_BITS     (1)
# ..         |     |---|    | ROW_LINE_BITS (3)
# ..         |     |--- - --| LINE_OFF_BITS (6)
# ..         |         |- --| ROW_OFF_BITS  (3)
# ..         |----- ---|    | ROW_BITS      (8)
# ..         |-----|        | INDEX_BITS    (5)
# .. --------|              | TAG_BITS      (53)

# The cache data BRAM organized as described above for each way
#subtype cache_row_t is std_ulogic_vector(ROW_SIZE_BITS-1 downto 0);
#
# The cache tags LUTRAM has a row per set.  Vivado is a pain and will
# not handle a clean (commented) definition of the cache tags as a 3d
# memory.  For now, work around it by putting all the tags of a set in
# a single row: one valid bit and one TAG_WIDTH-wide tag per way.
def CacheTagArray():
    tag_layout = [('valid', NUM_WAYS),
                  ('tag', TAG_RAM_WIDTH),
                  ]
    return Array(Record(tag_layout, name="tag%d" % x) \
                 for x in range(NUM_LINES))

def RowPerLineValidArray():
    return Array(Signal(name="rows_valid_%d" % x) \
                 for x in range(ROW_PER_LINE))


# TODO to be passed to nmigen as ram attributes
# attribute ram_style : string;
# attribute ram_style of cache_tags : signal is "distributed";

def TLBArray():
    tlb_layout = [('valid', 1),
                  ('tag', TLB_EA_TAG_BITS),
                  ('pte', TLB_PTE_BITS)
                  ]
    return Array(Record(tlb_layout, name="tlb%d" % x) for x in range(TLB_SIZE))

# Cache RAM interface
def CacheRamOut():
    return Array(Signal(ROW_SIZE_BITS, name="cache_out_%d" % x) \
                 for x in range(NUM_WAYS))

# PLRU output interface
def PLRUOut():
    return Array(Signal(WAY_BITS, name="plru_out_%d" % x) \
                 for x in range(NUM_LINES))

# Return the cache line index (tag index) for an address
def get_index(addr):
    return addr[LINE_OFF_BITS:SET_SIZE_BITS]

# Return the cache row index (data memory) for an address
def get_row(addr):
    return addr[ROW_OFF_BITS:SET_SIZE_BITS]

# Return the index of a row within a line
def get_row_of_line(row):
    return row[:ROW_BITS][:ROW_LINE_BITS]

# Returns whether this is the last row of a line
def is_last_row_addr(addr, last):
    return addr[ROW_OFF_BITS:LINE_OFF_BITS] == last

# Returns whether this is the last row of a line
def is_last_row(row, last):
    return get_row_of_line(row) == last

# Return the next row in the current cache line.  We use a dedicated
# function in order to limit the size of the generated adder to be
# only the bits within a cache line (3 bits with default settings)
def next_row(row):
    row_v = row[0:ROW_LINE_BITS] + 1
    return Cat(row_v[:ROW_LINE_BITS], row[ROW_LINE_BITS:])

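# Example for next_row above: with ROW_LINE_BITS=3, row 0b0101_111
# becomes 0b0101_000 - the 3-bit row-within-line counter wraps while
# the upper line-select bits pass through, so only a 3-bit adder is
# generated.
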
# Read the instruction word for the given address
# in the current cache row
def read_insn_word(addr, data):
    word = addr[2:INSN_BITS+2]
    return data.word_select(word, 32)

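# Example for read_insn_word above: with INSN_BITS=1 a 64-bit row holds
# two instructions, and address bit 2 picks word 0 (nia ...0xx) or
# word 1 (nia ...1xx) out of the row.
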
# Get the tag value from the address
def get_tag(addr):
    return addr[SET_SIZE_BITS:REAL_ADDR_BITS]

# Read a tag from a tag memory row
def read_tag(way, tagset):
    return tagset.word_select(way, TAG_WIDTH)[:TAG_BITS]

# Write a tag to tag memory row
def write_tag(way, tagset, tag):
    return read_tag(way, tagset).eq(tag)

# Simple hash for direct-mapped TLB index
def hash_ea(addr):
    hsh = (addr[TLB_LG_PGSZ:TLB_LG_PGSZ + TLB_BITS] ^
           addr[TLB_LG_PGSZ + TLB_BITS:TLB_LG_PGSZ + 2 * TLB_BITS] ^
           addr[TLB_LG_PGSZ + 2 * TLB_BITS:TLB_LG_PGSZ + 3 * TLB_BITS])
    return hsh


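# Example for hash_ea above: with TLB_LG_PGSZ=12 and TLB_BITS=6 the
# index is ea[12:18] ^ ea[18:24] ^ ea[24:30], XOR-folding three 6-bit
# fields of the effective page number into one direct-mapped TLB index.
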
# Cache reload state machine
@unique
class State(Enum):
    IDLE = 0
    CLR_TAG = 1
    WAIT_ACK = 2


class RegInternal(RecordObject):
    def __init__(self):
        super().__init__()
        # Cache hit state (Latches for 1 cycle BRAM access)
        self.hit_way = Signal(WAY_BITS)
        self.hit_nia = Signal(64)
        self.hit_smark = Signal()
        self.hit_valid = Signal()

        # Cache miss state (reload state machine)
        self.state = Signal(State, reset=State.IDLE)
        self.wb = WBMasterOut("wb")
        self.req_adr = Signal(64)
        self.store_way = Signal(WAY_BITS)
        self.store_index = Signal(INDEX_BITS)
        self.store_row = Signal(ROW_BITS)
        self.store_tag = Signal(TAG_BITS)
        self.store_valid = Signal()
        self.end_row_ix = Signal(ROW_LINE_BITS)
        self.rows_valid = RowPerLineValidArray()

        # TLB miss state
        self.fetch_failed = Signal()


class ICache(Elaboratable):
    """64 bit set associative icache.  All instructions are 4B aligned."""
    def __init__(self):
        self.i_in = Fetch1ToICacheType(name="i_in")
        self.i_out = ICacheToDecode1Type(name="i_out")

        self.m_in = MMUToICacheType(name="m_in")

        self.stall_in = Signal()
        self.stall_out = Signal()
        self.flush_in = Signal()
        self.inval_in = Signal()

        # standard naming (wired to non-standard for compatibility)
        self.bus = Interface(addr_width=32,
                             data_width=64,
                             granularity=8,
                             features={'stall'},
                             alignment=0,
                             name="icache")

        self.log_out = Signal(54)


    # Generate a cache RAM for each way
    def rams(self, m, r, cache_out_row, use_previous,
             replace_way, req_row):

        comb = m.d.comb
        sync = m.d.sync

        bus, stall_in = self.bus, self.stall_in

        # read condition (for every cache ram)
        do_read = Signal()
        comb += do_read.eq(~(stall_in | use_previous))

        # binary-to-unary converters: replace-way enabled by bus.ack,
        # hit-way left permanently enabled
        m.submodules.replace_way_e = re = Decoder(NUM_WAYS)
        m.submodules.hit_way_e = he = Decoder(NUM_WAYS)
        comb += re.i.eq(replace_way)
        comb += re.n.eq(~bus.ack)
        comb += he.i.eq(r.hit_way)

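        # The Decoders above follow nmigen one-hot semantics: while n is
        # low, o is the one-hot encoding of i (e.g. for NUM_WAYS=4, i=2
        # gives o=0b0100); asserting n forces o=0.  So re.o selects the
        # replace-way only while an ack is arriving, and he.o always
        # selects the hit-way.
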
        for i in range(NUM_WAYS):
            do_write = Signal(name="do_wr_%d" % i)
            rd_addr = Signal(ROW_BITS)
            wr_addr = Signal(ROW_BITS)
            d_out = Signal(ROW_SIZE_BITS, name="d_out_%d" % i)
            wr_sel = Signal(ROW_SIZE)

            way = CacheRam(ROW_BITS, ROW_SIZE_BITS, TRACE=True, ram_num=i)
            m.submodules["cacheram_%d" % i] = way

            comb += way.rd_en.eq(do_read)
            comb += way.rd_addr.eq(rd_addr)
            comb += d_out.eq(way.rd_data_o)
            comb += way.wr_sel.eq(wr_sel)
            comb += way.wr_addr.eq(wr_addr)
            comb += way.wr_data.eq(bus.dat_r)

            comb += do_write.eq(re.o[i])

            with m.If(do_write):
                sync += Display("cache write adr: %x data: %lx",
                                wr_addr, way.wr_data)

            with m.If(he.o[i]):
                comb += cache_out_row.eq(d_out)
                with m.If(do_read):
                    sync += Display("cache read adr: %x data: %x",
                                    req_row, d_out)

            comb += rd_addr.eq(req_row)
            comb += wr_addr.eq(r.store_row)
            comb += wr_sel.eq(Repl(do_write, ROW_SIZE))

    # Generate PLRUs
    def maybe_plrus(self, m, r, plru_victim):
        comb = m.d.comb

        # note: NUM_WAYS is an elaboration-time constant, so this
        # condition is fixed at build time (a "generate", in effect)
        with m.If(NUM_WAYS > 1):
            m.submodules.plru_e = e = Decoder(NUM_LINES)
            comb += e.i.eq(get_index(r.hit_nia))

            for i in range(NUM_LINES):
                plru = PLRU(WAY_BITS)
                m.submodules["plru_%d" % i] = plru

                # PLRU interface
                with m.If(e.o[i]):
                    comb += plru.acc_en.eq(r.hit_valid)

                comb += plru.acc_i.eq(r.hit_way)
                comb += plru_victim[i].eq(plru.lru_o)

    # TLB hit detection and real address generation
    def itlb_lookup(self, m, tlb_req_index, itlb,
                    real_addr, ra_valid, eaa_priv,
                    priv_fault, access_ok):

        comb = m.d.comb

        i_in = self.i_in

        pte = Signal(TLB_PTE_BITS)
        ttag = Signal(TLB_EA_TAG_BITS)

        comb += tlb_req_index.eq(hash_ea(i_in.nia))
        comb += pte.eq(itlb[tlb_req_index].pte)
        comb += ttag.eq(itlb[tlb_req_index].tag)

        with m.If(i_in.virt_mode):
            comb += real_addr.eq(Cat(i_in.nia[:TLB_LG_PGSZ],
                                     pte[TLB_LG_PGSZ:REAL_ADDR_BITS]))

            with m.If(ttag == i_in.nia[TLB_LG_PGSZ + TLB_BITS:64]):
                comb += ra_valid.eq(itlb[tlb_req_index].valid)

            comb += eaa_priv.eq(pte[3])

        with m.Else():
            comb += real_addr.eq(i_in.nia[:REAL_ADDR_BITS])
            comb += ra_valid.eq(1)
            comb += eaa_priv.eq(1)

        # No IAMR, so no KUEP support for now
        comb += priv_fault.eq(eaa_priv & ~i_in.priv_mode)
        comb += access_ok.eq(ra_valid & ~priv_fault)

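    # (Summary of itlb_lookup above: in virt_mode the page offset,
    # nia[:TLB_LG_PGSZ], comes straight from the effective address and
    # the real page number comes from pte[TLB_LG_PGSZ:REAL_ADDR_BITS];
    # pte[3] is the EAA privilege bit.  In real mode the nia is used
    # as-is and the translation is always valid.)
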
    # iTLB update
    def itlb_update(self, m, itlb):
        comb = m.d.comb
        sync = m.d.sync

        m_in = self.m_in

        wr_index = Signal(TLB_BITS)
        comb += wr_index.eq(hash_ea(m_in.addr))

        with m.If(m_in.tlbie & m_in.doall):
            # Clear all valid bits
            for i in range(TLB_SIZE):
                sync += itlb[i].valid.eq(0)

        with m.Elif(m_in.tlbie):
            # Clear entry regardless of hit or miss
            sync += itlb[wr_index].valid.eq(0)

        with m.Elif(m_in.tlbld):
            sync += itlb[wr_index].tag.eq(m_in.addr[TLB_LG_PGSZ + TLB_BITS:64])
            sync += itlb[wr_index].pte.eq(m_in.pte)
            sync += itlb[wr_index].valid.eq(1)

    # Cache hit detection, output to fetch2 and other misc logic
    def icache_comb(self, m, use_previous, r, req_index, req_row,
                    req_hit_way, req_tag, real_addr, req_laddr,
                    cache_tags, access_ok,
                    req_is_hit, req_is_miss, replace_way,
                    plru_victim, cache_out_row):

        comb = m.d.comb

        i_in, i_out, bus = self.i_in, self.i_out, self.bus
        flush_in, stall_out = self.flush_in, self.stall_out

        is_hit = Signal()
        hit_way = Signal(WAY_BITS)

        # i_in.sequential means that i_in.nia this cycle is 4 more than
        # last cycle.  If we read more than 32 bits at a time, had a
        # cache hit last cycle, and we don't want the first 32-bit chunk
        # then we can keep the data we read last cycle and just use that.
        with m.If(i_in.nia[2:INSN_BITS+2] != 0):
            comb += use_previous.eq(i_in.sequential & r.hit_valid)

        # Extract line, row and tag from request
        comb += req_index.eq(get_index(i_in.nia))
        comb += req_row.eq(get_row(i_in.nia))
        comb += req_tag.eq(get_tag(real_addr))

        # Calculate address of beginning of cache row, will be
        # used for cache miss processing if needed
        comb += req_laddr.eq(Cat(Const(0, ROW_OFF_BITS),
                                 real_addr[ROW_OFF_BITS:REAL_ADDR_BITS]))

        # Test if pending request is a hit on any way
        hitcond = Signal()
        comb += hitcond.eq((r.state == State.WAIT_ACK)
                           & (req_index == r.store_index)
                           & r.rows_valid[req_row % ROW_PER_LINE])
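        # (hitcond above lets a fetch hit on the line currently being
        #  reloaded: while in WAIT_ACK, any row already flagged in
        #  r.rows_valid has arrived from wishbone and is safe to use.)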
        # i_in.req asserts Decoder active
        cvb = Signal(NUM_WAYS)
        ctag = Signal(TAG_RAM_WIDTH)
        comb += ctag.eq(cache_tags[req_index].tag)
        comb += cvb.eq(cache_tags[req_index].valid)
        m.submodules.store_way_e = se = Decoder(NUM_WAYS)
        comb += se.i.eq(r.store_way)
        comb += se.n.eq(~i_in.req)
        for i in range(NUM_WAYS):
            tagi = Signal(TAG_BITS, name="tag_i%d" % i)
            hit_test = Signal(name="hit_test%d" % i)
            is_tag_hit = Signal(name="is_tag_hit_%d" % i)
            comb += tagi.eq(read_tag(i, ctag))
            comb += hit_test.eq(se.o[i])
            comb += is_tag_hit.eq((cvb[i] | (hitcond & hit_test)) &
                                  (tagi == req_tag))
            with m.If(is_tag_hit):
                comb += hit_way.eq(i)
                comb += is_hit.eq(1)

        # Generate the "hit" and "miss" signals
        # for the synchronous blocks
        with m.If(i_in.req & access_ok & ~flush_in):
            comb += req_is_hit.eq(is_hit)
            comb += req_is_miss.eq(~is_hit)

        comb += req_hit_way.eq(hit_way)

        # The way to replace on a miss
        with m.If(r.state == State.CLR_TAG):
            comb += replace_way.eq(plru_victim[r.store_index])
        with m.Else():
            comb += replace_way.eq(r.store_way)

        # Output instruction from current cache row
        #
        # Note: This is a mild violation of our design principle of
        # having pipeline stages output from a clean latch.  In this
        # case we output the result of a mux.  The alternative would be
        # to output an entire row, which I prefer not to do just yet
        # as it would force fetch2 to know about some of the cache
        # geometry information.
        comb += i_out.insn.eq(read_insn_word(r.hit_nia, cache_out_row))
        comb += i_out.valid.eq(r.hit_valid)
        comb += i_out.nia.eq(r.hit_nia)
        comb += i_out.stop_mark.eq(r.hit_smark)
        comb += i_out.fetch_failed.eq(r.fetch_failed)

        # Stall fetch1 if we have a miss on cache or TLB
        # or a protection fault
        comb += stall_out.eq(~(is_hit & access_ok))

        # Wishbone requests output (from the cache miss reload machine)
        comb += bus.we.eq(r.wb.we)
        comb += bus.adr.eq(r.wb.adr)
        comb += bus.sel.eq(r.wb.sel)
        comb += bus.stb.eq(r.wb.stb)
        comb += bus.dat_w.eq(r.wb.dat)
        comb += bus.cyc.eq(r.wb.cyc)

    # Cache hit synchronous machine
    def icache_hit(self, m, use_previous, r, req_is_hit, req_hit_way,
                   req_index, req_tag, real_addr):
        sync = m.d.sync

        i_in, stall_in = self.i_in, self.stall_in
        flush_in = self.flush_in

        # keep outputs to fetch2 unchanged on a stall
        # except that flush or reset sets valid to 0
        # If use_previous, keep the same data as last
        # cycle and use the second half
        with m.If(stall_in | use_previous):
            with m.If(flush_in):
                sync += r.hit_valid.eq(0)
        with m.Else():
            # On a hit, latch the request for the next cycle,
            # when the BRAM data will be available on the
            # cache_out output of the corresponding way
            sync += r.hit_valid.eq(req_is_hit)

            with m.If(req_is_hit):
                sync += r.hit_way.eq(req_hit_way)
                sync += Display("cache hit nia:%x IR:%x SM:%x idx:%x tag:%x "
                                "way:%x RA:%x", i_in.nia, i_in.virt_mode,
                                i_in.stop_mark, req_index, req_tag,
                                req_hit_way, real_addr)

        with m.If(~stall_in):
            # Send stop marks and NIA down regardless of validity
            sync += r.hit_smark.eq(i_in.stop_mark)
            sync += r.hit_nia.eq(i_in.nia)

    def icache_miss_idle(self, m, r, req_is_miss, req_laddr,
                         req_index, req_tag, replace_way, real_addr):
        comb = m.d.comb
        sync = m.d.sync

        i_in = self.i_in

        # Reset per-row valid flags, only used in WAIT_ACK
        for i in range(ROW_PER_LINE):
            sync += r.rows_valid[i].eq(0)

        # We need to read a cache line
        with m.If(req_is_miss):
            sync += Display("cache miss nia:%x IR:%x SM:%x idx:%x "
                            " way:%x tag:%x RA:%x", i_in.nia,
                            i_in.virt_mode, i_in.stop_mark, req_index,
                            replace_way, req_tag, real_addr)

            # Keep track of our index and way for subsequent stores
            st_row = Signal(ROW_BITS)
            comb += st_row.eq(get_row(req_laddr))
            sync += r.store_index.eq(req_index)
            sync += r.store_row.eq(st_row)
            sync += r.store_tag.eq(req_tag)
            sync += r.store_valid.eq(1)
            sync += r.end_row_ix.eq(get_row_of_line(st_row) - 1)

            # Prep for first wishbone read.  We calculate the address
            # of the start of the cache line and start the WB cycle.
            sync += r.req_adr.eq(req_laddr)
            sync += r.wb.cyc.eq(1)
            sync += r.wb.stb.eq(1)

            # Track that we had one request sent
            sync += r.state.eq(State.CLR_TAG)

    def icache_miss_clr_tag(self, m, r, replace_way,
                            req_index,
                            tagset, cache_tags):
        comb = m.d.comb
        sync = m.d.sync

        # Get victim way from plru
        sync += r.store_way.eq(replace_way)

        # Force misses on that way while reloading that line:
        # clear the victim way's valid bit (one bit per way)
        cv = Signal(NUM_WAYS)
        comb += cv.eq(cache_tags[req_index].valid)
        comb += cv.bit_select(replace_way, 1).eq(0)
        sync += cache_tags[req_index].valid.eq(cv)

        for i in range(NUM_WAYS):
            with m.If(i == replace_way):
                comb += tagset.eq(cache_tags[r.store_index].tag)
                comb += write_tag(i, tagset, r.store_tag)
                sync += cache_tags[r.store_index].tag.eq(tagset)

        sync += r.state.eq(State.WAIT_ACK)

    def icache_miss_wait_ack(self, m, r, replace_way, inval_in,
                             cache_tags, stbs_done):
        comb = m.d.comb
        sync = m.d.sync

        bus = self.bus

        # Requests are all sent if stb is 0
        stbs_zero = Signal()
        comb += stbs_zero.eq(r.wb.stb == 0)
        comb += stbs_done.eq(stbs_zero)

        # If we are still sending requests, was one accepted?
        with m.If(~bus.stall & ~stbs_zero):
            # That was the last word?  We are done sending.
            # Clear stb and set stbs_done so we can handle
            # an eventual last ack on the same cycle.
            with m.If(is_last_row_addr(r.req_adr, r.end_row_ix)):
                sync += Display("IS_LAST_ROW_ADDR r.wb.addr:%x "
                                "r.end_row_ix:%x r.wb.stb:%x stbs_zero:%x "
                                "stbs_done:%x", r.wb.adr, r.end_row_ix,
                                r.wb.stb, stbs_zero, stbs_done)
                sync += r.wb.stb.eq(0)
                comb += stbs_done.eq(1)

            # Calculate the next row address
            rarange = Signal(LINE_OFF_BITS - ROW_OFF_BITS)
            comb += rarange.eq(r.req_adr[ROW_OFF_BITS:LINE_OFF_BITS] + 1)
            sync += r.req_adr[ROW_OFF_BITS:LINE_OFF_BITS].eq(rarange)
            sync += Display("RARANGE r.req_adr:%x rarange:%x "
                            "stbs_zero:%x stbs_done:%x",
                            r.req_adr, rarange, stbs_zero, stbs_done)

        # Incoming acks processing
        with m.If(bus.ack):
            sync += Display("WB_IN_ACK data:%x stbs_zero:%x "
                            "stbs_done:%x",
                            bus.dat_r, stbs_zero, stbs_done)

            sync += r.rows_valid[r.store_row % ROW_PER_LINE].eq(1)

            # Check for completion
            with m.If(stbs_done & is_last_row(r.store_row, r.end_row_ix)):
                # Complete wishbone cycle
                sync += r.wb.cyc.eq(0)
                # be nice, clear addr
                sync += r.req_adr.eq(0)

                # Cache line is now valid: set the victim way's valid
                # bit (one bit per way), unless an invalidate arrived
                cv = Signal(NUM_WAYS)
                comb += cv.eq(cache_tags[r.store_index].valid)
                comb += cv.bit_select(replace_way, 1).eq(
                    r.store_valid & ~inval_in)
                sync += cache_tags[r.store_index].valid.eq(cv)

                sync += r.state.eq(State.IDLE)

            # move on to next request in row
            # Increment store row counter
            sync += r.store_row.eq(next_row(r.store_row))

    # Cache miss/reload synchronous machine
    def icache_miss(self, m, r, req_is_miss,
                    req_index, req_laddr, req_tag, replace_way,
                    cache_tags, access_ok, real_addr):
        comb = m.d.comb
        sync = m.d.sync

        i_in, bus, m_in = self.i_in, self.bus, self.m_in
        stall_in, flush_in = self.stall_in, self.flush_in
        inval_in = self.inval_in

        tagset = Signal(TAG_RAM_WIDTH)
        stbs_done = Signal()

        comb += r.wb.sel.eq(-1)
        comb += r.wb.adr.eq(r.req_adr[3:])

        # Process cache invalidations
        with m.If(inval_in):
            for i in range(NUM_LINES):
                sync += cache_tags[i].valid.eq(0)
            sync += r.store_valid.eq(0)

        # Main state machine
        with m.Switch(r.state):

            with m.Case(State.IDLE):
                self.icache_miss_idle(m, r, req_is_miss, req_laddr,
                                      req_index, req_tag, replace_way,
                                      real_addr)

            with m.Case(State.CLR_TAG, State.WAIT_ACK):
                with m.If(r.state == State.CLR_TAG):
                    self.icache_miss_clr_tag(m, r, replace_way,
                                             req_index, tagset, cache_tags)

                self.icache_miss_wait_ack(m, r, replace_way, inval_in,
                                          cache_tags, stbs_done)

        # TLB miss and protection fault processing
        with m.If(flush_in | m_in.tlbld):
            sync += r.fetch_failed.eq(0)
        with m.Elif(i_in.req & ~access_ok & ~stall_in):
            sync += r.fetch_failed.eq(1)

    # icache_log: if LOG_LENGTH > 0 generate
    def icache_log(self, m, req_hit_way, ra_valid, access_ok,
                   req_is_miss, req_is_hit, lway, wstate, r):
        comb = m.d.comb
        sync = m.d.sync

        bus, i_out = self.bus, self.i_out
        log_out, stall_out = self.log_out, self.stall_out

        # Output data to logger
        for i in range(LOG_LENGTH):
            log_data = Signal(54)
            lway = Signal(WAY_BITS)
            wstate = Signal()

            sync += lway.eq(req_hit_way)
            sync += wstate.eq(0)

            with m.If(r.state != State.IDLE):
                sync += wstate.eq(1)

            sync += log_data.eq(Cat(
                ra_valid, access_ok, req_is_miss, req_is_hit,
                lway, wstate, r.hit_nia[2:6], r.fetch_failed,
                stall_out, bus.stall, r.wb.cyc, r.wb.stb,
                r.real_addr[3:6], bus.ack, i_out.insn, i_out.valid
            ))
            comb += log_out.eq(log_data)

    def elaborate(self, platform):

        m = Module()
        comb = m.d.comb

        # Storage.  Hopefully "cache_rows" is a BRAM, the rest is LUTs
        cache_tags = CacheTagArray()

        # TLB Array
        itlb = TLBArray()

        # TODO to be passed to nmigen as ram attributes
        # attribute ram_style of itlb_tags : signal is "distributed";
        # attribute ram_style of itlb_ptes : signal is "distributed";

        # Privilege bit from PTE EAA field
        eaa_priv = Signal()

        r = RegInternal()

        # Async signal on incoming request
        req_index = Signal(INDEX_BITS)
        req_row = Signal(ROW_BITS)
        req_hit_way = Signal(WAY_BITS)
        req_tag = Signal(TAG_BITS)
        req_is_hit = Signal()
        req_is_miss = Signal()
        req_laddr = Signal(64)

        tlb_req_index = Signal(TLB_BITS)
        real_addr = Signal(REAL_ADDR_BITS)
        ra_valid = Signal()
        priv_fault = Signal()
        access_ok = Signal()
        use_previous = Signal()

        cache_out_row = Signal(ROW_SIZE_BITS)

        plru_victim = PLRUOut()
        replace_way = Signal(WAY_BITS)

        # fake-up the wishbone stall signal to comply with pipeline mode
        # same thing is done in dcache.py
        comb += self.bus.stall.eq(self.bus.cyc & ~self.bus.ack)

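        # (With stall tied to cyc & ~ack above, a request is only
        #  accepted in the cycle its ack arrives, so at most one access
        #  is in flight: pipelined-mode signalling, classic-cycle
        #  behaviour.)
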
        # call sub-functions putting everything together,
        # using shared signals established above
        self.rams(m, r, cache_out_row, use_previous, replace_way, req_row)
        self.maybe_plrus(m, r, plru_victim)
        self.itlb_lookup(m, tlb_req_index, itlb, real_addr,
                         ra_valid, eaa_priv, priv_fault,
                         access_ok)
        self.itlb_update(m, itlb)
        self.icache_comb(m, use_previous, r, req_index, req_row, req_hit_way,
                         req_tag, real_addr, req_laddr,
                         cache_tags, access_ok, req_is_hit, req_is_miss,
                         replace_way, plru_victim, cache_out_row)
        self.icache_hit(m, use_previous, r, req_is_hit, req_hit_way,
                        req_index, req_tag, real_addr)
        self.icache_miss(m, r, req_is_miss, req_index,
                         req_laddr, req_tag, replace_way, cache_tags,
                         access_ok, real_addr)
        #self.icache_log(m, log_out, req_hit_way, ra_valid, access_ok,
        #                req_is_miss, req_is_hit, lway, wstate, r)

        return m


def icache_sim(dut):
    i_in = dut.i_in
    i_out = dut.i_out
    m_out = dut.m_in

    yield i_in.priv_mode.eq(1)
    yield i_in.req.eq(0)
    yield i_in.nia.eq(0)
    yield i_in.stop_mark.eq(0)
    yield m_out.tlbld.eq(0)
    yield m_out.tlbie.eq(0)
    yield m_out.addr.eq(0)
    yield m_out.pte.eq(0)
    yield
    yield
    yield
    yield

    # miss, stalls for a bit
    yield i_in.req.eq(1)
    yield i_in.nia.eq(Const(0x0000000000000004, 64))
    yield
    valid = yield i_out.valid
    while not valid:
        yield
        valid = yield i_out.valid
    yield i_in.req.eq(0)

    insn = yield i_out.insn
    nia = yield i_out.nia
    assert insn == 0x00000001, \
        "insn @%x=%x expected 00000001" % (nia, insn)
    yield

    # hit
    yield i_in.req.eq(1)
    yield i_in.nia.eq(Const(0x0000000000000008, 64))
    yield
    valid = yield i_out.valid
    while not valid:
        yield
        valid = yield i_out.valid
    yield i_in.req.eq(0)

    nia = yield i_out.nia
    insn = yield i_out.insn
    yield
    assert insn == 0x00000002, \
        "insn @%x=%x expected 00000002" % (nia, insn)

    # another miss
    yield i_in.req.eq(1)
    yield i_in.nia.eq(Const(0x0000000000000040, 64))
    yield
    valid = yield i_out.valid
    while not valid:
        yield
        valid = yield i_out.valid
    yield i_in.req.eq(0)

    nia = yield i_out.nia
    insn = yield i_out.insn
    assert insn == 0x00000010, \
        "insn @%x=%x expected 00000010" % (nia, insn)

    # test something that aliases (this only works because
    # the unit test SRAM is a depth of 512)
    yield i_in.req.eq(1)
    yield i_in.nia.eq(Const(0x0000000000000100, 64))
    yield
    yield
    valid = yield i_out.valid
    assert not valid
    for i in range(30):
        yield
    yield
    valid = yield i_out.valid
    insn = yield i_out.insn
    nia = yield i_out.nia
    assert valid
    assert insn == 0x00000040, \
        "insn @%x=%x expected 00000040" % (nia, insn)
    yield i_in.req.eq(0)


def test_icache(mem):
    dut = ICache()

    memory = Memory(width=64, depth=512, init=mem)
    sram = SRAM(memory=memory, granularity=8)

    m = Module()

    m.submodules.icache = dut
    m.submodules.sram = sram

    m.d.comb += sram.bus.cyc.eq(dut.bus.cyc)
    m.d.comb += sram.bus.stb.eq(dut.bus.stb)
    m.d.comb += sram.bus.we.eq(dut.bus.we)
    m.d.comb += sram.bus.sel.eq(dut.bus.sel)
    m.d.comb += sram.bus.adr.eq(dut.bus.adr)
    m.d.comb += sram.bus.dat_w.eq(dut.bus.dat_w)

    m.d.comb += dut.bus.ack.eq(sram.bus.ack)
    m.d.comb += dut.bus.dat_r.eq(sram.bus.dat_r)

    # nmigen Simulation
    sim = Simulator(m)
    sim.add_clock(1e-6)

    sim.add_sync_process(wrap(icache_sim(dut)))
    with sim.write_vcd('test_icache.vcd'):
        sim.run()


if __name__ == '__main__':
    dut = ICache()
    vl = rtlil.convert(dut, ports=[])
    with open("test_icache.il", "w") as f:
        f.write(vl)

    # set up memory every 32-bits with incrementing values 0 1 2 ...
    mem = []
    for i in range(512):
        mem.append((i*2) | ((i*2+1)<<32))

    test_icache(mem)