src/soc/experiment/icache.py

   1 """ICache
   2
   3 based on Anton Blanchard microwatt icache.vhdl
   4
   5 Set associative icache
   6
   7 TODO (in no specific order):
   8 * Add debug interface to inspect cache content
   9 * Add snoop/invalidate path
  10 * Add multi-hit error detection
  11 * Pipelined bus interface (wb or axi)
  12 * Maybe add parity? There's a few bits free in each BRAM row on Xilinx
  13 * Add optimization: service hits on partially loaded lines
  14 * Add optimization: (maybe) interrupt reload on fluch/redirect
  15 * Check if playing with the geometry of the cache tags allow for more
  16   efficient use of distributed RAM and less logic/muxes. Currently we
  17   write TAG_BITS width which may not match full ram blocks and might
  18   cause muxes to be inferred for "partial writes".
  19 * Check if making the read size of PLRU a ROM helps utilization
  20
  21 """
  22 from enum import Enum, unique
  23 from nmigen import (Module, Signal, Elaboratable, Cat, Array, Const, Repl)
  24 from nmigen.cli import main, rtlil
  25 from nmutil.iocontrol import RecordObject
  26 from nmigen.utils import log2_int
  27 from nmutil.util import Display
  28
  29 #from nmutil.plru import PLRU
  30 from soc.experiment.cache_ram import CacheRam
  31 from soc.experiment.plru import PLRU
  32
  33 from soc.experiment.mem_types import (Fetch1ToICacheType,
  34                                       ICacheToDecode1Type,
  35                                       MMUToICacheType)
  36
  37 from soc.experiment.wb_types import (WB_ADDR_BITS, WB_DATA_BITS,
  38                                      WB_SEL_BITS, WBAddrType, WBDataType,
  39                                      WBSelType, WBMasterOut, WBSlaveOut,
  40                                      WBMasterOutVector, WBSlaveOutVector,
  41                                      WBIOMasterOut, WBIOSlaveOut)
  42
  43 # for test
  44 from nmigen_soc.wishbone.sram import SRAM
  45 from nmigen import Memory
  46 from nmutil.util import wrap
  47 from nmigen.cli import main, rtlil
  48 if True:
  49     from nmigen.back.pysim import Simulator, Delay, Settle
  50 else:
  51     from nmigen.sim.cxxsim import Simulator, Delay, Settle
  52
  53
  54 SIM            = 0
  55 LINE_SIZE      = 64
  56 # BRAM organisation: We never access more than wishbone_data_bits
  57 # at a time so to save resources we make the array only that wide,
  58 # and use consecutive indices for to make a cache "line"
  59 #
  60 # ROW_SIZE is the width in bytes of the BRAM (based on WB, so 64-bits)
  61 ROW_SIZE       = WB_DATA_BITS // 8
  62 # Number of lines in a set
  63 NUM_LINES      = 16
  64 # Number of ways
  65 NUM_WAYS       = 4
  66 # L1 ITLB number of entries (direct mapped)
  67 TLB_SIZE       = 64
  68 # L1 ITLB log_2(page_size)
  69 TLB_LG_PGSZ    = 12
  70 # Number of real address bits that we store
  71 REAL_ADDR_BITS = 56
  72 # Non-zero to enable log data collection
  73 LOG_LENGTH     = 0
  74
  75 ROW_SIZE_BITS  = ROW_SIZE * 8
  76 # ROW_PER_LINE is the number of row
  77 # (wishbone) transactions in a line
  78 ROW_PER_LINE   = LINE_SIZE // ROW_SIZE
  79 # BRAM_ROWS is the number of rows in
  80 # BRAM needed to represent the full icache
  81 BRAM_ROWS      = NUM_LINES * ROW_PER_LINE
  82 # INSN_PER_ROW is the number of 32bit
  83 # instructions per BRAM row
  84 INSN_PER_ROW   = ROW_SIZE_BITS // 32
  85
  86 # Bit fields counts in the address
  87 #
  88 # INSN_BITS is the number of bits to
  89 # select an instruction in a row
  90 INSN_BITS      = log2_int(INSN_PER_ROW)
  91 # ROW_BITS is the number of bits to
  92 # select a row
  93 ROW_BITS       = log2_int(BRAM_ROWS)
  94 # ROW_LINE_BITS is the number of bits to
  95 # select a row within a line
  96 ROW_LINE_BITS   = log2_int(ROW_PER_LINE)
  97 # LINE_OFF_BITS is the number of bits for
  98 # the offset in a cache line
  99 LINE_OFF_BITS  = log2_int(LINE_SIZE)
 100 # ROW_OFF_BITS is the number of bits for
 101 # the offset in a row
 102 ROW_OFF_BITS   = log2_int(ROW_SIZE)
 103 # INDEX_BITS is the number of bits to
 104 # select a cache line
 105 INDEX_BITS     = log2_int(NUM_LINES)
 106 # SET_SIZE_BITS is the log base 2 of
 107 # the set size
 108 SET_SIZE_BITS  = LINE_OFF_BITS + INDEX_BITS
 109 # TAG_BITS is the number of bits of
 110 # the tag part of the address
 111 TAG_BITS       = REAL_ADDR_BITS - SET_SIZE_BITS
 112 # TAG_WIDTH is the width in bits of each way of the tag RAM
 113 TAG_WIDTH = TAG_BITS + 7 - ((TAG_BITS + 7) % 8)
 114
 115 # WAY_BITS is the number of bits to
 116 # select a way
 117 WAY_BITS       = log2_int(NUM_WAYS)
 118 TAG_RAM_WIDTH  = TAG_BITS * NUM_WAYS
 119
 120 #-- L1 ITLB.
 121 #constant TLB_BITS : natural := log2(TLB_SIZE);
 122 #constant TLB_EA_TAG_BITS : natural := 64 - (TLB_LG_PGSZ + TLB_BITS);
 123 #constant TLB_PTE_BITS : natural := 64;
 124 TLB_BITS        = log2_int(TLB_SIZE)
 125 TLB_EA_TAG_BITS = 64 - (TLB_LG_PGSZ + TLB_BITS)
 126 TLB_PTE_BITS    = 64
 127
 128 print("BRAM_ROWS       =", BRAM_ROWS)
 129 print("INDEX_BITS      =", INDEX_BITS)
 130 print("INSN_BITS       =", INSN_BITS)
 131 print("INSN_PER_ROW    =", INSN_PER_ROW)
 132 print("LINE_SIZE       =", LINE_SIZE)
 133 print("LINE_OFF_BITS   =", LINE_OFF_BITS)
 134 print("LOG_LENGTH      =", LOG_LENGTH)
 135 print("NUM_LINES       =", NUM_LINES)
 136 print("NUM_WAYS        =", NUM_WAYS)
 137 print("REAL_ADDR_BITS  =", REAL_ADDR_BITS)
 138 print("ROW_BITS        =", ROW_BITS)
 139 print("ROW_OFF_BITS    =", ROW_OFF_BITS)
 140 print("ROW_LINE_BITS   =", ROW_LINE_BITS)
 141 print("ROW_PER_LINE    =", ROW_PER_LINE)
 142 print("ROW_SIZE        =", ROW_SIZE)
 143 print("ROW_SIZE_BITS   =", ROW_SIZE_BITS)
 144 print("SET_SIZE_BITS   =", SET_SIZE_BITS)
 145 print("SIM             =", SIM)
 146 print("TAG_BITS        =", TAG_BITS)
 147 print("TAG_RAM_WIDTH   =", TAG_RAM_WIDTH)
 148 print("TAG_BITS        =", TAG_BITS)
 149 print("TLB_BITS        =", TLB_BITS)
 150 print("TLB_EA_TAG_BITS =", TLB_EA_TAG_BITS)
 151 print("TLB_LG_PGSZ     =", TLB_LG_PGSZ)
 152 print("TLB_PTE_BITS    =", TLB_PTE_BITS)
 153 print("TLB_SIZE        =", TLB_SIZE)
 154 print("WAY_BITS        =", WAY_BITS)
 155
 156 # from microwatt/utils.vhdl
 157 def ispow2(n):
 158     if ((n << 32) & ((n-1) << 32)) == 0:
 159         return True
 160
 161     else:
 162         return False
 163
 164 assert LINE_SIZE % ROW_SIZE == 0
 165 assert ispow2(LINE_SIZE), "LINE_SIZE not power of 2"
 166 assert ispow2(NUM_LINES), "NUM_LINES not power of 2"
 167 assert ispow2(ROW_PER_LINE), "ROW_PER_LINE not power of 2"
 168 assert ispow2(INSN_PER_ROW), "INSN_PER_ROW not power of 2"
 169 assert (ROW_BITS == (INDEX_BITS + ROW_LINE_BITS)), \
 170     "geometry bits don't add up"
 171 assert (LINE_OFF_BITS == (ROW_OFF_BITS + ROW_LINE_BITS)), \
 172    "geometry bits don't add up"
 173 assert (REAL_ADDR_BITS == (TAG_BITS + INDEX_BITS + LINE_OFF_BITS)), \
 174     "geometry bits don't add up"
 175 assert (REAL_ADDR_BITS == (TAG_BITS + ROW_BITS + ROW_OFF_BITS)), \
 176     "geometry bits don't add up"
 177
 178 # architecture rtl of icache is
 179 #constant ROW_SIZE_BITS : natural := ROW_SIZE*8;
 180 #-- ROW_PER_LINE is the number of row (wishbone
 181 #-- transactions) in a line
 182 #constant ROW_PER_LINE  : natural := LINE_SIZE / ROW_SIZE;
 183 #-- BRAM_ROWS is the number of rows in BRAM
 184 #-- needed to represent the full
 185 #-- icache
 186 #constant BRAM_ROWS     : natural := NUM_LINES * ROW_PER_LINE;
 187 #-- INSN_PER_ROW is the number of 32bit instructions per BRAM row
 188 #constant INSN_PER_ROW  : natural := ROW_SIZE_BITS / 32;
 189 #-- Bit fields counts in the address
 190 #
 191 #-- INSN_BITS is the number of bits to select
 192 #-- an instruction in a row
 193 #constant INSN_BITS     : natural := log2(INSN_PER_ROW);
 194 #-- ROW_BITS is the number of bits to select a row
 195 #constant ROW_BITS      : natural := log2(BRAM_ROWS);
 196 #-- ROW_LINE_BITS is the number of bits to
 197 #-- select a row within a line
 198 #constant ROW_LINE_BITS  : natural := log2(ROW_PER_LINE);
 199 #-- LINE_OFF_BITS is the number of bits for the offset
 200 #-- in a cache line
 201 #constant LINE_OFF_BITS : natural := log2(LINE_SIZE);
 202 #-- ROW_OFF_BITS is the number of bits for the offset in a row
 203 #constant ROW_OFF_BITS  : natural := log2(ROW_SIZE);
 204 #-- INDEX_BITS is the number of bits to select a cache line
 205 #constant INDEX_BITS    : natural := log2(NUM_LINES);
 206 #-- SET_SIZE_BITS is the log base 2 of the set size
 207 #constant SET_SIZE_BITS : natural := LINE_OFF_BITS + INDEX_BITS;
 208 #-- TAG_BITS is the number of bits of the tag part of the address
 209 #constant TAG_BITS      : natural := REAL_ADDR_BITS - SET_SIZE_BITS;
 210 #-- WAY_BITS is the number of bits to select a way
 211 #constant WAY_BITS     : natural := log2(NUM_WAYS);
 212
 213 #-- Example of layout for 32 lines of 64 bytes:
 214 #--
 215 #-- ..  tag    |index|  line  |
 216 #-- ..         |   row   |    |
 217 #-- ..         |     |   | |00| zero          (2)
 218 #-- ..         |     |   |-|  | INSN_BITS     (1)
 219 #-- ..         |     |---|    | ROW_LINE_BITS  (3)
 220 #-- ..         |     |--- - --| LINE_OFF_BITS (6)
 221 #-- ..         |         |- --| ROW_OFF_BITS  (3)
 222 #-- ..         |----- ---|    | ROW_BITS      (8)
 223 #-- ..         |-----|        | INDEX_BITS    (5)
 224 #-- .. --------|              | TAG_BITS      (53)
 225    # Example of layout for 32 lines of 64 bytes:
 226    #
 227    # ..  tag    |index|  line  |
 228    # ..         |   row   |    |
 229    # ..         |     |   | |00| zero          (2)
 230    # ..         |     |   |-|  | INSN_BITS     (1)
 231    # ..         |     |---|    | ROW_LINE_BITS  (3)
 232    # ..         |     |--- - --| LINE_OFF_BITS (6)
 233    # ..         |         |- --| ROW_OFF_BITS  (3)
 234    # ..         |----- ---|    | ROW_BITS      (8)
 235    # ..         |-----|        | INDEX_BITS    (5)
 236    # .. --------|              | TAG_BITS      (53)
 237
 238 #subtype row_t is integer range 0 to BRAM_ROWS-1;
 239 #subtype index_t is integer range 0 to NUM_LINES-1;
 240 #subtype way_t is integer range 0 to NUM_WAYS-1;
 241 #subtype row_in_line_t is unsigned(ROW_LINE_BITS-1 downto 0);
 242 #
 243 #-- The cache data BRAM organized as described above for each way
 244 #subtype cache_row_t is std_ulogic_vector(ROW_SIZE_BITS-1 downto 0);
 245 #
 246 #-- The cache tags LUTRAM has a row per set. Vivado is a pain and will
 247 #-- not handle a clean (commented) definition of the cache tags as a 3d
 248 #-- memory. For now, work around it by putting all the tags
 249 #subtype cache_tag_t is std_logic_vector(TAG_BITS-1 downto 0);
 250 #  type cache_tags_set_t is array(way_t) of cache_tag_t;
 251 #  type cache_tags_array_t is array(index_t) of cache_tags_set_t;
 252 #constant TAG_RAM_WIDTH : natural := TAG_BITS * NUM_WAYS;
 253 #subtype cache_tags_set_t is std_logic_vector(TAG_RAM_WIDTH-1 downto 0);
 254 #type cache_tags_array_t is array(index_t) of cache_tags_set_t;
 255 def CacheTagArray():
 256     return Array(Signal(TAG_RAM_WIDTH, name="cachetag_%d" %x) \
 257                  for x in range(NUM_LINES))
 258
 259 #-- The cache valid bits
 260 #subtype cache_way_valids_t is std_ulogic_vector(NUM_WAYS-1 downto 0);
 261 #type cache_valids_t is array(index_t) of cache_way_valids_t;
 262 #type row_per_line_valid_t is array(0 to ROW_PER_LINE - 1) of std_ulogic;
 263 def CacheValidBitsArray():
 264     return Array(Signal(NUM_WAYS, name="cachevalid_%d" %x) \
 265                  for x in range(NUM_LINES))
 266
 267 def RowPerLineValidArray():
 268     return Array(Signal(name="rows_valid_%d" %x) \
 269                  for x in range(ROW_PER_LINE))
 270
 271
 272 #attribute ram_style : string;
 273 #attribute ram_style of cache_tags : signal is "distributed";
 274    # TODO to be passed to nigmen as ram attributes
 275    # attribute ram_style : string;
 276    # attribute ram_style of cache_tags : signal is "distributed";
 277
 278
 279 #subtype tlb_index_t is integer range 0 to TLB_SIZE - 1;
 280 #type tlb_valids_t is array(tlb_index_t) of std_ulogic;
 281 #subtype tlb_tag_t is std_ulogic_vector(TLB_EA_TAG_BITS - 1 downto 0);
 282 #type tlb_tags_t is array(tlb_index_t) of tlb_tag_t;
 283 #subtype tlb_pte_t is std_ulogic_vector(TLB_PTE_BITS - 1 downto 0);
 284 #type tlb_ptes_t is array(tlb_index_t) of tlb_pte_t;
 285 def TLBValidBitsArray():
 286     return Array(Signal(name="tlbvalid_%d" %x) \
 287                  for x in range(TLB_SIZE))
 288
 289 def TLBTagArray():
 290     return Array(Signal(TLB_EA_TAG_BITS, name="tlbtag_%d" %x) \
 291                  for x in range(TLB_SIZE))
 292
 293 def TLBPtesArray():
 294     return Array(Signal(TLB_PTE_BITS, name="tlbptes_%d" %x) \
 295                  for x in range(TLB_SIZE))
 296
 297
 298 #-- Cache RAM interface
 299 #type cache_ram_out_t is array(way_t) of cache_row_t;
 300 # Cache RAM interface
 301 def CacheRamOut():
 302     return Array(Signal(ROW_SIZE_BITS, name="cache_out_%d" %x) \
 303                  for x in range(NUM_WAYS))
 304
 305 #-- PLRU output interface
 306 #type plru_out_t is array(index_t) of
 307 # std_ulogic_vector(WAY_BITS-1 downto 0);
 308 # PLRU output interface
 309 def PLRUOut():
 310     return Array(Signal(WAY_BITS, name="plru_out_%d" %x) \
 311                  for x in range(NUM_LINES))
 312
 313 #     -- Return the cache line index (tag index) for an address
 314 #     function get_index(addr: std_ulogic_vector(63 downto 0))
 315 #      return index_t is
 316 #     begin
 317 #         return to_integer(unsigned(
 318 #          addr(SET_SIZE_BITS - 1 downto LINE_OFF_BITS)
 319 #         ));
 320 #     end;
 321 # Return the cache line index (tag index) for an address
 322 def get_index(addr):
 323     return addr[LINE_OFF_BITS:SET_SIZE_BITS]
 324
 325 #     -- Return the cache row index (data memory) for an address
 326 #     function get_row(addr: std_ulogic_vector(63 downto 0))
 327 #       return row_t is
 328 #     begin
 329 #         return to_integer(unsigned(
 330 #          addr(SET_SIZE_BITS - 1 downto ROW_OFF_BITS)
 331 #         ));
 332 #     end;
 333 # Return the cache row index (data memory) for an address
 334 def get_row(addr):
 335     return addr[ROW_OFF_BITS:SET_SIZE_BITS]
 336
 337 #     -- Return the index of a row within a line
 338 #     function get_row_of_line(row: row_t) return row_in_line_t is
 339 #       variable row_v : unsigned(ROW_BITS-1 downto 0);
 340 #     begin
 341 #       row_v := to_unsigned(row, ROW_BITS);
 342 #         return row_v(ROW_LINE_BITS-1 downto 0);
 343 #     end;
 344 # Return the index of a row within a line
 345 def get_row_of_line(row):
 346     return row[:ROW_LINE_BITS]
 347
 348 #     -- Returns whether this is the last row of a line
 349 #     function is_last_row_addr(addr: wishbone_addr_type;
 350 #      last: row_in_line_t
 351 #     )
 352 #      return boolean is
 353 #     begin
 354 #       return unsigned(
 355 #        addr(LINE_OFF_BITS-1 downto ROW_OFF_BITS)
 356 #       ) = last;
 357 #     end;
 358 # Returns whether this is the last row of a line
 359 def is_last_row_addr(addr, last):
 360     return addr[ROW_OFF_BITS:LINE_OFF_BITS] == last
 361
 362 #     -- Returns whether this is the last row of a line
 363 #     function is_last_row(row: row_t;
 364 #      last: row_in_line_t) return boolean is
 365 #     begin
 366 #       return get_row_of_line(row) = last;
 367 #     end;
 368 # Returns whether this is the last row of a line
 369 def is_last_row(row, last):
 370     return get_row_of_line(row) == last
 371
 372 #     -- Return the next row in the current cache line. We use a dedicated
 373 #     -- function in order to limit the size of the generated adder to be
 374 #     -- only the bits within a cache line (3 bits with default settings)
 375 #     function next_row(row: row_t) return row_t is
 376 #       variable row_v   : std_ulogic_vector(ROW_BITS-1 downto 0);
 377 #       variable row_idx : std_ulogic_vector(ROW_LINE_BITS-1 downto 0);
 378 #       variable result  : std_ulogic_vector(ROW_BITS-1 downto 0);
 379 #     begin
 380 #       row_v := std_ulogic_vector(to_unsigned(row, ROW_BITS));
 381 #       row_idx := row_v(ROW_LINE_BITS-1 downto 0);
 382 #       row_v(ROW_LINE_BITS-1 downto 0) :=
 383 #        std_ulogic_vector(unsigned(row_idx) + 1);
 384 #       return to_integer(unsigned(row_v));
 385 #     end;
 386 # Return the next row in the current cache line. We use a dedicated
 387 # function in order to limit the size of the generated adder to be
 388 # only the bits within a cache line (3 bits with default settings)
 389 def next_row(row):
 390     row_v = row[0:ROW_LINE_BITS] + 1
 391     return Cat(row_v[:ROW_LINE_BITS], row[ROW_LINE_BITS:])
 392 #     -- Read the instruction word for the given address in the
 393 #     -- current cache row
 394 #     function read_insn_word(addr: std_ulogic_vector(63 downto 0);
 395 #                           data: cache_row_t) return std_ulogic_vector is
 396 #       variable word: integer range 0 to INSN_PER_ROW-1;
 397 #     begin
 398 #         word := to_integer(unsigned(addr(INSN_BITS+2-1 downto 2)));
 399 #       return data(31+word*32 downto word*32);
 400 #     end;
 401 # Read the instruction word for the given address
 402 # in the current cache row
 403 def read_insn_word(addr, data):
 404     word = addr[2:INSN_BITS+2]
 405     return data.word_select(word, 32)
 406
 407 #     -- Get the tag value from the address
 408 #     function get_tag(
 409 #      addr: std_ulogic_vector(REAL_ADDR_BITS - 1 downto 0)
 410 #     )
 411 #      return cache_tag_t is
 412 #     begin
 413 #         return addr(REAL_ADDR_BITS - 1 downto SET_SIZE_BITS);
 414 #     end;
 415 # Get the tag value from the address
 416 def get_tag(addr):
 417     return addr[SET_SIZE_BITS:REAL_ADDR_BITS]
 418
 419 #     -- Read a tag from a tag memory row
 420 #     function read_tag(way: way_t; tagset: cache_tags_set_t)
 421 #      return cache_tag_t is
 422 #     begin
 423 #       return tagset((way+1) * TAG_BITS - 1 downto way * TAG_BITS);
 424 #     end;
 425 # Read a tag from a tag memory row
 426 def read_tag(way, tagset):
 427     return tagset.word_select(way, TAG_BITS)
 428
 429 #     -- Write a tag to tag memory row
 430 #     procedure write_tag(way: in way_t;
 431 #      tagset: inout cache_tags_set_t; tag: cache_tag_t) is
 432 #     begin
 433 #       tagset((way+1) * TAG_BITS - 1 downto way * TAG_BITS) := tag;
 434 #     end;
 435 # Write a tag to tag memory row
 436 def write_tag(way, tagset, tag):
 437     return read_tag(way, tagset).eq(tag)
 438
 439 #     -- Simple hash for direct-mapped TLB index
 440 #     function hash_ea(addr: std_ulogic_vector(63 downto 0))
 441 #      return tlb_index_t is
 442 #         variable hash : std_ulogic_vector(TLB_BITS - 1 downto 0);
 443 #     begin
 444 #         hash := addr(TLB_LG_PGSZ + TLB_BITS - 1 downto TLB_LG_PGSZ)
 445 #                 xor addr(
 446 #                  TLB_LG_PGSZ + 2 * TLB_BITS - 1 downto
 447 #                  TLB_LG_PGSZ + TLB_BITS
 448 #                 )
 449 #                 xor addr(
 450 #                  TLB_LG_PGSZ + 3 * TLB_BITS - 1 downto
 451 #                  TLB_LG_PGSZ + 2 * TLB_BITS
 452 #                 );
 453 #         return to_integer(unsigned(hash));
 454 #     end;
 455 # Simple hash for direct-mapped TLB index
 456 def hash_ea(addr):
 457     hsh = addr[TLB_LG_PGSZ:TLB_LG_PGSZ + TLB_BITS] ^ addr[
 458            TLB_LG_PGSZ + TLB_BITS:TLB_LG_PGSZ + 2 * TLB_BITS
 459           ] ^ addr[
 460            TLB_LG_PGSZ + 2 * TLB_BITS:TLB_LG_PGSZ + 3 * TLB_BITS
 461           ]
 462     return hsh
 463
 464
 465 # Cache reload state machine
 466 @unique
 467 class State(Enum):
 468     IDLE     = 0
 469     CLR_TAG  = 1
 470     WAIT_ACK = 2
 471
 472
 473 class RegInternal(RecordObject):
 474     def __init__(self):
 475         super().__init__()
 476         # Cache hit state (Latches for 1 cycle BRAM access)
 477         self.hit_way      = Signal(NUM_WAYS)
 478         self.hit_nia      = Signal(64)
 479         self.hit_smark    = Signal()
 480         self.hit_valid    = Signal()
 481
 482         # Cache miss state (reload state machine)
 483         self.state        = Signal(State, reset=State.IDLE)
 484         self.wb           = WBMasterOut("wb")
 485         self.req_adr      = Signal(64)
 486         self.store_way    = Signal(NUM_WAYS)
 487         self.store_index  = Signal(NUM_LINES)
 488         self.store_row    = Signal(BRAM_ROWS)
 489         self.store_tag    = Signal(TAG_BITS)
 490         self.store_valid  = Signal()
 491         self.end_row_ix   = Signal(ROW_LINE_BITS)
 492         self.rows_valid   = RowPerLineValidArray()
 493
 494         # TLB miss state
 495         self.fetch_failed = Signal()
 496
 497 # -- 64 bit direct mapped icache. All instructions are 4B aligned.
 498 #
 499 # entity icache is
 500 #     generic (
 501 #         SIM : boolean := false;
 502 #         -- Line size in bytes
 503 #         LINE_SIZE : positive := 64;
 504 #         -- BRAM organisation: We never access more
 505 #         -- than wishbone_data_bits
 506 #         -- at a time so to save resources we make the
 507 #         -- array only that wide,
 508 #         -- and use consecutive indices for to make a cache "line"
 509 #         --
 510 #         -- ROW_SIZE is the width in bytes of the BRAM (based on WB,
 511 #         -- so 64-bits)
 512 #         ROW_SIZE  : positive := wishbone_data_bits / 8;
 513 #         -- Number of lines in a set
 514 #         NUM_LINES : positive := 32;
 515 #         -- Number of ways
 516 #         NUM_WAYS  : positive := 4;
 517 #         -- L1 ITLB number of entries (direct mapped)
 518 #         TLB_SIZE : positive := 64;
 519 #         -- L1 ITLB log_2(page_size)
 520 #         TLB_LG_PGSZ : positive := 12;
 521 #         -- Number of real address bits that we store
 522 #         REAL_ADDR_BITS : positive := 56;
 523 #         -- Non-zero to enable log data collection
 524 #         LOG_LENGTH : natural := 0
 525 #         );
 526 #     port (
 527 #         clk          : in std_ulogic;
 528 #         rst          : in std_ulogic;
 529 #
 530 #         i_in         : in Fetch1ToIcacheType;
 531 #         i_out        : out IcacheToDecode1Type;
 532 #
 533 #         m_in         : in MmuToIcacheType;
 534 #
 535 #         stall_in     : in std_ulogic;
 536 #       stall_out    : out std_ulogic;
 537 #       flush_in     : in std_ulogic;
 538 #       inval_in     : in std_ulogic;
 539 #
 540 #         wishbone_out : out wishbone_master_out;
 541 #         wishbone_in  : in wishbone_slave_out;
 542 #
 543 #         log_out      : out std_ulogic_vector(53 downto 0)
 544 #         );
 545 # end entity icache;
 546 # 64 bit direct mapped icache. All instructions are 4B aligned.
 547 class ICache(Elaboratable):
 548     """64 bit direct mapped icache. All instructions are 4B aligned."""
 549     def __init__(self):
 550         self.i_in           = Fetch1ToICacheType(name="i_in")
 551         self.i_out          = ICacheToDecode1Type(name="i_out")
 552
 553         self.m_in           = MMUToICacheType(name="m_in")
 554
 555         self.stall_in       = Signal()
 556         self.stall_out      = Signal()
 557         self.flush_in       = Signal()
 558         self.inval_in       = Signal()
 559
 560         self.wb_out         = WBMasterOut(name="wb_out")
 561         self.wb_in          = WBSlaveOut(name="wb_in")
 562
 563         self.log_out        = Signal(54)
 564
 565
 566     # Generate a cache RAM for each way
 567     def rams(self, m, r, cache_out_row, use_previous,
 568              replace_way, req_row):
 569
 570         comb = m.d.comb
 571         sync = m.d.sync
 572
 573         wb_in, stall_in = self.wb_in, self.stall_in
 574
 575         for i in range(NUM_WAYS):
 576             do_read  = Signal(name="do_rd_%d" % i)
 577             do_write = Signal(name="do_wr_%d" % i)
 578             rd_addr  = Signal(ROW_BITS)
 579             wr_addr  = Signal(ROW_BITS)
 580             d_out    = Signal(ROW_SIZE_BITS, name="d_out_%d" % i)
 581             wr_sel   = Signal(ROW_SIZE)
 582
 583             way = CacheRam(ROW_BITS, ROW_SIZE_BITS, True)
 584             setattr(m.submodules, "cacheram_%d" % i, way)
 585
 586             comb += way.rd_en.eq(do_read)
 587             comb += way.rd_addr.eq(rd_addr)
 588             comb += d_out.eq(way.rd_data_o)
 589             comb += way.wr_sel.eq(wr_sel)
 590             comb += way.wr_addr.eq(wr_addr)
 591             comb += way.wr_data.eq(wb_in.dat)
 592
 593             comb += do_read.eq(~(stall_in | use_previous))
 594             comb += do_write.eq(wb_in.ack & (replace_way == i))
 595
 596             with m.If(do_write):
 597                 sync += Display("cache write adr: %x data: %lx",
 598                                 wr_addr, way.wr_data)
 599
 600             with m.If(r.hit_way == i):
 601                 comb += cache_out_row.eq(d_out)
 602                 with m.If(do_read):
 603                     sync += Display("cache read adr: %x data: %x",
 604                                      req_row, d_out)
 605
 606             comb += rd_addr.eq(req_row)
 607             comb += wr_addr.eq(r.store_row)
 608             comb += wr_sel.eq(Repl(do_write, ROW_SIZE))
 609
 610     # Generate PLRUs
 611     def maybe_plrus(self, m, r, plru_victim):
 612         comb = m.d.comb
 613
 614         with m.If(NUM_WAYS > 1):
 615             for i in range(NUM_LINES):
 616                 plru_acc_i  = Signal(WAY_BITS)
 617                 plru_acc_en = Signal()
 618                 plru        = PLRU(WAY_BITS)
 619                 setattr(m.submodules, "plru_%d" % i, plru)
 620
 621                 comb += plru.acc_i.eq(plru_acc_i)
 622                 comb += plru.acc_en.eq(plru_acc_en)
 623
 624                 # PLRU interface
 625                 with m.If(get_index(r.hit_nia) == i):
 626                     comb += plru.acc_en.eq(r.hit_valid)
 627
 628                 comb += plru.acc_i.eq(r.hit_way)
 629                 comb += plru_victim[i].eq(plru.lru_o)
 630
 631     # TLB hit detection and real address generation
 632     def itlb_lookup(self, m, tlb_req_index, itlb_ptes, itlb_tags,
 633                     real_addr, itlb_valid_bits, ra_valid, eaa_priv,
 634                     priv_fault, access_ok):
 635
 636         comb = m.d.comb
 637
 638         i_in = self.i_in
 639
 640         pte  = Signal(TLB_PTE_BITS)
 641         ttag = Signal(TLB_EA_TAG_BITS)
 642
 643         comb += tlb_req_index.eq(hash_ea(i_in.nia))
 644         comb += pte.eq(itlb_ptes[tlb_req_index])
 645         comb += ttag.eq(itlb_tags[tlb_req_index])
 646
 647         with m.If(i_in.virt_mode):
 648             comb += real_addr.eq(Cat(
 649                      i_in.nia[:TLB_LG_PGSZ],
 650                      pte[TLB_LG_PGSZ:REAL_ADDR_BITS]
 651                     ))
 652
 653             with m.If(ttag == i_in.nia[TLB_LG_PGSZ + TLB_BITS:64]):
 654                 comb += ra_valid.eq(itlb_valid_bits[tlb_req_index])
 655
 656             comb += eaa_priv.eq(pte[3])
 657
 658         with m.Else():
 659             comb += real_addr.eq(i_in.nia[:REAL_ADDR_BITS])
 660             comb += ra_valid.eq(1)
 661             comb += eaa_priv.eq(1)
 662
 663         # No IAMR, so no KUEP support for now
 664         comb += priv_fault.eq(eaa_priv & ~i_in.priv_mode)
 665         comb += access_ok.eq(ra_valid & ~priv_fault)
 666
 667     # iTLB update
 668     def itlb_update(self, m, itlb_valid_bits, itlb_tags, itlb_ptes):
 669         comb = m.d.comb
 670         sync = m.d.sync
 671
 672         m_in = self.m_in
 673
 674         wr_index = Signal(TLB_SIZE)
 675         comb += wr_index.eq(hash_ea(m_in.addr))
 676
 677         with m.If(m_in.tlbie & m_in.doall):
 678             # Clear all valid bits
 679             for i in range(TLB_SIZE):
 680                 sync += itlb_valid_bits[i].eq(0)
 681
 682         with m.Elif(m_in.tlbie):
 683             # Clear entry regardless of hit or miss
 684             sync += itlb_valid_bits[wr_index].eq(0)
 685
 686         with m.Elif(m_in.tlbld):
 687             sync += itlb_tags[wr_index].eq(
 688                      m_in.addr[TLB_LG_PGSZ + TLB_BITS:64]
 689                     )
 690             sync += itlb_ptes[wr_index].eq(m_in.pte)
 691             sync += itlb_valid_bits[wr_index].eq(1)
 692
 693     # Cache hit detection, output to fetch2 and other misc logic
 694     def icache_comb(self, m, use_previous, r, req_index, req_row,
 695                     req_hit_way, req_tag, real_addr, req_laddr,
 696                     cache_valid_bits, cache_tags, access_ok,
 697                     req_is_hit, req_is_miss, replace_way,
 698                     plru_victim, cache_out_row):
 699
 700         comb = m.d.comb
 701
 702         i_in, i_out, wb_out = self.i_in, self.i_out, self.wb_out
 703         flush_in, stall_out = self.flush_in, self.stall_out
 704
 705         is_hit  = Signal()
 706         hit_way = Signal(NUM_WAYS)
 707
 708         # i_in.sequential means that i_in.nia this cycle is 4 more than
 709         # last cycle.  If we read more than 32 bits at a time, had a
 710         # cache hit last cycle, and we don't want the first 32-bit chunk
 711         # then we can keep the data we read last cycle and just use that.
 712         with m.If(i_in.nia[2:INSN_BITS+2] != 0):
 713             comb += use_previous.eq(i_in.sequential & r.hit_valid)
 714
 715         # Extract line, row and tag from request
 716         comb += req_index.eq(get_index(i_in.nia))
 717         comb += req_row.eq(get_row(i_in.nia))
 718         comb += req_tag.eq(get_tag(real_addr))
 719
 720         # Calculate address of beginning of cache row, will be
 721         # used for cache miss processing if needed
 722         comb += req_laddr.eq(Cat(
 723                  Const(0, ROW_OFF_BITS),
 724                  real_addr[ROW_OFF_BITS:REAL_ADDR_BITS],
 725                 ))
 726
 727         # Test if pending request is a hit on any way
 728         hitcond = Signal()
 729         comb += hitcond.eq((r.state == State.WAIT_ACK)
 730                     & (req_index == r.store_index)
 731                     & r.rows_valid[req_row % ROW_PER_LINE])
 732         with m.If(i_in.req):
 733             cvb = Signal(NUM_WAYS)
 734             ctag = Signal(TAG_RAM_WIDTH)
 735             comb += ctag.eq(cache_tags[req_index])
 736             comb += cvb.eq(cache_valid_bits[req_index])
 737             for i in range(NUM_WAYS):
 738                 tagi = Signal(TAG_BITS, name="tag_i%d" % i)
 739                 comb += tagi.eq(read_tag(i, ctag))
 740                 hit_test = Signal(name="hit_test%d" % i)
 741                 comb += hit_test.eq(i == r.store_way)
 742                 with m.If((cvb[i] | (hitcond & hit_test))
 743                           & (tagi == req_tag)):
 744                     comb += hit_way.eq(i)
 745                     comb += is_hit.eq(1)
 746
 747         # Generate the "hit" and "miss" signals
 748         # for the synchronous blocks
 749         with m.If(i_in.req & access_ok & ~flush_in):
 750             comb += req_is_hit.eq(is_hit)
 751             comb += req_is_miss.eq(~is_hit)
 752
 753         with m.Else():
 754             comb += req_is_hit.eq(0)
 755             comb += req_is_miss.eq(0)
 756
 757         comb += req_hit_way.eq(hit_way)
 758
 759         # The way to replace on a miss
 760         with m.If(r.state == State.CLR_TAG):
 761             comb += replace_way.eq(plru_victim[r.store_index])
 762         with m.Else():
 763             comb += replace_way.eq(r.store_way)
 764
 765         # Output instruction from current cache row
 766         #
 767         # Note: This is a mild violation of our design principle of
 768         # having pipeline stages output from a clean latch. In this
 769         # case we output the result of a mux. The alternative would
 770         # be output an entire row which I prefer not to do just yet
 771         # as it would force fetch2 to know about some of the cache
 772         # geometry information.
 773         comb += i_out.insn.eq(read_insn_word(r.hit_nia, cache_out_row))
 774         comb += i_out.valid.eq(r.hit_valid)
 775         comb += i_out.nia.eq(r.hit_nia)
 776         comb += i_out.stop_mark.eq(r.hit_smark)
 777         comb += i_out.fetch_failed.eq(r.fetch_failed)
 778
 779         # Stall fetch1 if we have a miss on cache or TLB
 780         # or a protection fault
 781         comb += stall_out.eq(~(is_hit & access_ok))
 782
 783         # Wishbone requests output (from the cache miss reload machine)
 784         comb += wb_out.eq(r.wb)
 785
 786     # Cache hit synchronous machine
 787     def icache_hit(self, m, use_previous, r, req_is_hit, req_hit_way,
 788                    req_index, req_tag, real_addr):
 789         sync = m.d.sync
 790
 791         i_in, stall_in = self.i_in, self.stall_in
 792         flush_in       = self.flush_in
 793
 794         # keep outputs to fetch2 unchanged on a stall
 795         # except that flush or reset sets valid to 0
 796         # If use_previous, keep the same data as last
 797         # cycle and use the second half
 798         with m.If(stall_in | use_previous):
 799             with m.If(flush_in):
 800                 sync += r.hit_valid.eq(0)
 801         with m.Else():
 802             # On a hit, latch the request for the next cycle,
 803             # when the BRAM data will be available on the
 804             # cache_out output of the corresponding way
 805             sync += r.hit_valid.eq(req_is_hit)
 806
 807             with m.If(req_is_hit):
 808                 sync += r.hit_way.eq(req_hit_way)
 809                 sync += Display(
 810                          "cache hit nia:%x IR:%x SM:%x idx:%x tag:%x " \
 811                          "way:%x RA:%x", i_in.nia, i_in.virt_mode, \
 812                          i_in.stop_mark, req_index, req_tag, \
 813                          req_hit_way, real_addr
 814                         )
 815
 816
 817
 818         with m.If(~stall_in):
 819             # Send stop marks and NIA down regardless of validity
 820             sync += r.hit_smark.eq(i_in.stop_mark)
 821             sync += r.hit_nia.eq(i_in.nia)
 822
 823     def icache_miss_idle(self, m, r, req_is_miss, req_laddr,
 824                          req_index, req_tag, replace_way, real_addr):
 825         comb = m.d.comb
 826         sync = m.d.sync
 827
 828         i_in = self.i_in
 829
 830         # Reset per-row valid flags, only used in WAIT_ACK
 831         for i in range(ROW_PER_LINE):
 832             sync += r.rows_valid[i].eq(0)
 833
 834         # We need to read a cache line
 835         with m.If(req_is_miss):
 836             sync += Display(
 837                      "cache miss nia:%x IR:%x SM:%x idx:%x "
 838                      " way:%x tag:%x RA:%x", i_in.nia,
 839                      i_in.virt_mode, i_in.stop_mark, req_index,
 840                      replace_way, req_tag, real_addr
 841                     )
 842
 843             # Keep track of our index and way for subsequent stores
 844             st_row = Signal(BRAM_ROWS)
 845             comb += st_row.eq(get_row(req_laddr))
 846             sync += r.store_index.eq(req_index)
 847             sync += r.store_row.eq(st_row)
 848             sync += r.store_tag.eq(req_tag)
 849             sync += r.store_valid.eq(1)
 850             sync += r.end_row_ix.eq(get_row_of_line(st_row) - 1)
 851
 852             # Prep for first wishbone read.  We calculate the address
 853             # of the start of the cache line and start the WB cycle.
 854             sync += r.req_adr.eq(req_laddr)
 855             sync += r.wb.cyc.eq(1)
 856             sync += r.wb.stb.eq(1)
 857
 858             # Track that we had one request sent
 859             sync += r.state.eq(State.CLR_TAG)
 860
 861     def icache_miss_clr_tag(self, m, r, replace_way,
 862                             cache_valid_bits, req_index,
 863                             tagset, cache_tags):
 864
 865         comb = m.d.comb
 866         sync = m.d.sync
 867
 868         # Get victim way from plru
 869         sync += r.store_way.eq(replace_way)
 870         # Force misses on that way while reloading that line
 871         cv = Signal(INDEX_BITS)
 872         comb += cv.eq(cache_valid_bits[req_index])
 873         comb += cv.bit_select(replace_way, 1).eq(0)
 874         sync += cache_valid_bits[req_index].eq(cv)
 875
 876         for i in range(NUM_WAYS):
 877             with m.If(i == replace_way):
 878                 comb += tagset.eq(cache_tags[r.store_index])
 879                 comb += write_tag(i, tagset, r.store_tag)
 880                 sync += cache_tags[r.store_index].eq(tagset)
 881
 882         sync += r.state.eq(State.WAIT_ACK)
 883
 884     def icache_miss_wait_ack(self, m, r, replace_way, inval_in,
 885                              stbs_done, cache_valid_bits):
 886         comb = m.d.comb
 887         sync = m.d.sync
 888
 889         wb_in = self.wb_in
 890
 891         # Requests are all sent if stb is 0
 892         stbs_zero = Signal()
 893         comb += stbs_zero.eq(r.wb.stb == 0)
 894         comb += stbs_done.eq(stbs_zero)
 895
 896         # If we are still sending requests, was one accepted?
 897         with m.If(~wb_in.stall & ~stbs_zero):
 898             # That was the last word ?  # We are done sending.
 899             # Clear stb and set stbs_done # so we can handle
 900             # an eventual last ack on # the same cycle.
 901             with m.If(is_last_row_addr(r.req_adr, r.end_row_ix)):
 902                 sync += Display(
 903                          "IS_LAST_ROW_ADDR r.wb.addr:%x " \
 904                          "r.end_row_ix:%x r.wb.stb:%x stbs_zero:%x " \
 905                          "stbs_done:%x", r.wb.adr, r.end_row_ix,
 906                          r.wb.stb, stbs_zero, stbs_done
 907                         )
 908                 sync += r.wb.stb.eq(0)
 909                 comb += stbs_done.eq(1)
 910
 911             # Calculate the next row address
 912             rarange = Signal(LINE_OFF_BITS - ROW_OFF_BITS)
 913             comb += rarange.eq(
 914                      r.req_adr[ROW_OFF_BITS:LINE_OFF_BITS] + 1
 915                     )
 916             sync += r.req_adr[ROW_OFF_BITS:LINE_OFF_BITS].eq(
 917                      rarange
 918                     )
 919             sync += Display("RARANGE r.req_adr:%x rarange:%x "
 920                             "stbs_zero:%x stbs_done:%x",
 921                             r.req_adr, rarange, stbs_zero, stbs_done)
 922
 923         # Incoming acks processing
 924         with m.If(wb_in.ack):
 925             sync += Display("WB_IN_ACK data:%x stbs_zero:%x "
 926                             "stbs_done:%x",
 927                             wb_in.dat, stbs_zero, stbs_done)
 928
 929             sync += r.rows_valid[r.store_row % ROW_PER_LINE].eq(1)
 930
 931             # Check for completion
 932             with m.If(stbs_done &
 933                       is_last_row(r.store_row, r.end_row_ix)):
 934                 # Complete wishbone cycle
 935                 sync += r.wb.cyc.eq(0)
 936                 sync += r.req_adr.eq(0) # be nice, clear addr
 937
 938                 # Cache line is now valid
 939                 cv = Signal(INDEX_BITS)
 940                 comb += cv.eq(cache_valid_bits[r.store_index])
 941                 comb += cv.bit_select(replace_way, 1).eq(
 942                          r.store_valid & ~inval_in
 943                         )
 944                 sync += cache_valid_bits[r.store_index].eq(cv)
 945
 946                 sync += r.state.eq(State.IDLE)
 947
 948             # not completed, move on to next request in row
 949             with m.Else():
 950                 # Increment store row counter
 951                 sync += r.store_row.eq(next_row(r.store_row))
 952
 953
 954     # Cache miss/reload synchronous machine
 955     def icache_miss(self, m, cache_valid_bits, r, req_is_miss,
 956                     req_index, req_laddr, req_tag, replace_way,
 957                     cache_tags, access_ok, real_addr):
 958         comb = m.d.comb
 959         sync = m.d.sync
 960
 961         i_in, wb_in, m_in  = self.i_in, self.wb_in, self.m_in
 962         stall_in, flush_in = self.stall_in, self.flush_in
 963         inval_in           = self.inval_in
 964
 965 #       variable tagset    : cache_tags_set_t;
 966 #       variable stbs_done : boolean;
 967
 968         tagset    = Signal(TAG_RAM_WIDTH)
 969         stbs_done = Signal()
 970
 971         comb += r.wb.sel.eq(-1)
 972         comb += r.wb.adr.eq(r.req_adr[3:])
 973
 974         # Process cache invalidations
 975         with m.If(inval_in):
 976             for i in range(NUM_LINES):
 977                 sync += cache_valid_bits[i].eq(0)
 978             sync += r.store_valid.eq(0)
 979
 980         # Main state machine
 981         with m.Switch(r.state):
 982
 983             with m.Case(State.IDLE):
 984                 self.icache_miss_idle(
 985                     m, r, req_is_miss, req_laddr,
 986                     req_index, req_tag, replace_way,
 987                     real_addr
 988                 )
 989
 990             with m.Case(State.CLR_TAG, State.WAIT_ACK):
 991                 with m.If(r.state == State.CLR_TAG):
 992                     self.icache_miss_clr_tag(
 993                         m, r, replace_way,
 994                         cache_valid_bits, req_index,
 995                         tagset, cache_tags
 996                     )
 997
 998                 self.icache_miss_wait_ack(
 999                     m, r, replace_way, inval_in,
1000                     stbs_done, cache_valid_bits
1001                 )
1002
1003         # TLB miss and protection fault processing
1004         with m.If(flush_in | m_in.tlbld):
1005             sync += r.fetch_failed.eq(0)
1006         with m.Elif(i_in.req & ~access_ok & ~stall_in):
1007             sync += r.fetch_failed.eq(1)
1008
1009     #  icache_log: if LOG_LENGTH > 0 generate
1010     def icache_log(self, m, req_hit_way, ra_valid, access_ok,
1011                    req_is_miss, req_is_hit, lway, wstate, r):
1012         comb = m.d.comb
1013         sync = m.d.sync
1014
1015         wb_in, i_out       = self.wb_in, self.i_out
1016         log_out, stall_out = self.log_out, self.stall_out
1017
1018 #         -- Output data to logger
1019 #         signal log_data    : std_ulogic_vector(53 downto 0);
1020 #     begin
1021 #         data_log: process(clk)
1022 #             variable lway: way_t;
1023 #             variable wstate: std_ulogic;
1024         # Output data to logger
1025         for i in range(LOG_LENGTH):
1026             # Output data to logger
1027             log_data = Signal(54)
1028             lway     = Signal(NUM_WAYS)
1029             wstate   = Signal()
1030
1031 #         begin
1032 #             if rising_edge(clk) then
1033 #                 lway := req_hit_way;
1034 #                 wstate := '0';
1035             sync += lway.eq(req_hit_way)
1036             sync += wstate.eq(0)
1037
1038 #                 if r.state /= IDLE then
1039 #                     wstate := '1';
1040 #                 end if;
1041             with m.If(r.state != State.IDLE):
1042                 sync += wstate.eq(1)
1043
1044 #                 log_data <= i_out.valid &
1045 #                             i_out.insn &
1046 #                             wishbone_in.ack &
1047 #                             r.wb.adr(5 downto 3) &
1048 #                             r.wb.stb & r.wb.cyc &
1049 #                             wishbone_in.stall &
1050 #                             stall_out &
1051 #                             r.fetch_failed &
1052 #                             r.hit_nia(5 downto 2) &
1053 #                             wstate &
1054 #                             std_ulogic_vector(to_unsigned(lway, 3)) &
1055 #                             req_is_hit & req_is_miss &
1056 #                             access_ok &
1057 #                             ra_valid;
1058             sync += log_data.eq(Cat(
1059                      ra_valid, access_ok, req_is_miss, req_is_hit,
1060                      lway, wstate, r.hit_nia[2:6], r.fetch_failed,
1061                      stall_out, wb_in.stall, r.wb.cyc, r.wb.stb,
1062                      r.wb.adr[3:6], wb_in.ack, i_out.insn, i_out.valid
1063                     ))
1064 #             end if;
1065 #         end process;
1066 #         log_out <= log_data;
1067             comb += log_out.eq(log_data)
1068 #     end generate;
1069 # end;
1070
1071     def elaborate(self, platform):
1072
1073         m                = Module()
1074         comb             = m.d.comb
1075
1076         # Storage. Hopefully "cache_rows" is a BRAM, the rest is LUTs
1077         cache_tags       = CacheTagArray()
1078         cache_valid_bits = CacheValidBitsArray()
1079
1080         itlb_valid_bits  = TLBValidBitsArray()
1081         itlb_tags        = TLBTagArray()
1082         itlb_ptes        = TLBPtesArray()
1083         # TODO to be passed to nmigen as ram attributes
1084         # attribute ram_style of itlb_tags : signal is "distributed";
1085         # attribute ram_style of itlb_ptes : signal is "distributed";
1086
1087         # Privilege bit from PTE EAA field
1088         eaa_priv         = Signal()
1089
1090         r                = RegInternal()
1091
1092         # Async signal on incoming request
1093         req_index        = Signal(NUM_LINES)
1094         req_row          = Signal(BRAM_ROWS)
1095         req_hit_way      = Signal(NUM_WAYS)
1096         req_tag          = Signal(TAG_BITS)
1097         req_is_hit       = Signal()
1098         req_is_miss      = Signal()
1099         req_laddr        = Signal(64)
1100
1101         tlb_req_index    = Signal(TLB_SIZE)
1102         real_addr        = Signal(REAL_ADDR_BITS)
1103         ra_valid         = Signal()
1104         priv_fault       = Signal()
1105         access_ok        = Signal()
1106         use_previous     = Signal()
1107
1108         cache_out_row    = Signal(ROW_SIZE_BITS)
1109
1110         plru_victim      = PLRUOut()
1111         replace_way      = Signal(NUM_WAYS)
1112
1113         # call sub-functions putting everything together,
1114         # using shared signals established above
1115         self.rams(m, r, cache_out_row, use_previous, replace_way, req_row)
1116         self.maybe_plrus(m, r, plru_victim)
1117         self.itlb_lookup(m, tlb_req_index, itlb_ptes, itlb_tags, real_addr,
1118                          itlb_valid_bits, ra_valid, eaa_priv, priv_fault,
1119                          access_ok)
1120         self.itlb_update(m, itlb_valid_bits, itlb_tags, itlb_ptes)
1121         self.icache_comb(m, use_previous, r, req_index, req_row, req_hit_way,
1122                          req_tag, real_addr, req_laddr, cache_valid_bits,
1123                          cache_tags, access_ok, req_is_hit, req_is_miss,
1124                          replace_way, plru_victim, cache_out_row)
1125         self.icache_hit(m, use_previous, r, req_is_hit, req_hit_way,
1126                         req_index, req_tag, real_addr)
1127         self.icache_miss(m, cache_valid_bits, r, req_is_miss, req_index,
1128                          req_laddr, req_tag, replace_way, cache_tags,
1129                          access_ok, real_addr)
1130         #self.icache_log(m, log_out, req_hit_way, ra_valid, access_ok,
1131         #                req_is_miss, req_is_hit, lway, wstate, r)
1132
1133         return m
1134
1135
1136 def icache_sim(dut):
1137     i_out = dut.i_in
1138     i_in  = dut.i_out
1139     m_out = dut.m_in
1140
1141     yield i_in.valid.eq(0)
1142     yield i_out.priv_mode.eq(1)
1143     yield i_out.req.eq(0)
1144     yield i_out.nia.eq(0)
1145     yield i_out.stop_mark.eq(0)
1146     yield m_out.tlbld.eq(0)
1147     yield m_out.tlbie.eq(0)
1148     yield m_out.addr.eq(0)
1149     yield m_out.pte.eq(0)
1150     yield
1151     yield
1152     yield
1153     yield
1154     yield i_out.req.eq(1)
1155     yield i_out.nia.eq(Const(0x0000000000000004, 64))
1156     for i in range(30):
1157         yield
1158     yield
1159     valid = yield i_in.valid
1160     nia   = yield i_out.nia
1161     insn  = yield i_in.insn
1162     print(f"valid? {valid}")
1163     assert valid
1164     assert insn == 0x00000001, \
1165         "insn @%x=%x expected 00000001" % (nia, insn)
1166     yield i_out.req.eq(0)
1167     yield
1168
1169     # hit
1170     yield
1171     yield
1172     yield i_out.req.eq(1)
1173     yield i_out.nia.eq(Const(0x0000000000000008, 64))
1174     yield
1175     yield
1176     valid = yield i_in.valid
1177     nia   = yield i_in.nia
1178     insn  = yield i_in.insn
1179     assert valid
1180     assert insn == 0x00000002, \
1181         "insn @%x=%x expected 00000002" % (nia, insn)
1182     yield
1183
1184     # another miss
1185     yield i_out.req.eq(1)
1186     yield i_out.nia.eq(Const(0x0000000000000040, 64))
1187     for i in range(30):
1188         yield
1189     yield
1190     valid = yield i_in.valid
1191     nia   = yield i_out.nia
1192     insn  = yield i_in.insn
1193     assert valid
1194     assert insn == 0x00000010, \
1195         "insn @%x=%x expected 00000010" % (nia, insn)
1196
1197     # test something that aliases
1198     yield i_out.req.eq(1)
1199     yield i_out.nia.eq(Const(0x0000000000000100, 64))
1200     yield
1201     yield
1202     valid = yield i_in.valid
1203     assert ~valid
1204     for i in range(30):
1205         yield
1206     yield
1207     insn  = yield i_in.insn
1208     valid = yield i_in.valid
1209     insn  = yield i_in.insn
1210     assert valid
1211     assert insn == 0x00000040, \
1212          "insn @%x=%x expected 00000040" % (nia, insn)
1213     yield i_out.req.eq(0)
1214
1215
1216
1217 def test_icache(mem):
1218      dut    = ICache()
1219
1220      memory = Memory(width=64, depth=512, init=mem)
1221      sram   = SRAM(memory=memory, granularity=8)
1222
1223      m      = Module()
1224
1225      m.submodules.icache = dut
1226      m.submodules.sram   = sram
1227
1228      m.d.comb += sram.bus.cyc.eq(dut.wb_out.cyc)
1229      m.d.comb += sram.bus.stb.eq(dut.wb_out.stb)
1230      m.d.comb += sram.bus.we.eq(dut.wb_out.we)
1231      m.d.comb += sram.bus.sel.eq(dut.wb_out.sel)
1232      m.d.comb += sram.bus.adr.eq(dut.wb_out.adr)
1233      m.d.comb += sram.bus.dat_w.eq(dut.wb_out.dat)
1234
1235      m.d.comb += dut.wb_in.ack.eq(sram.bus.ack)
1236      m.d.comb += dut.wb_in.dat.eq(sram.bus.dat_r)
1237
1238      # nmigen Simulation
1239      sim = Simulator(m)
1240      sim.add_clock(1e-6)
1241
1242      sim.add_sync_process(wrap(icache_sim(dut)))
1243      with sim.write_vcd('test_icache.vcd'):
1244          sim.run()
1245
1246 if __name__ == '__main__':
1247     dut = ICache()
1248     vl = rtlil.convert(dut, ports=[])
1249     with open("test_icache.il", "w") as f:
1250         f.write(vl)
1251
1252     mem = []
1253     for i in range(512):
1254         mem.append((i*2)| ((i*2+1)<<32))
1255
1256     test_icache(mem)
1257