icache.py move icache_miss WAIT_ACK FSM state into method icache_miss_wait_ack()...
[soc.git] / src / soc / experiment / icache.py
1 """ICache
2
3 based on Anton Blanchard microwatt icache.vhdl
4
5 Set associative icache
6
7 TODO (in no specific order):
8 * Add debug interface to inspect cache content
9 * Add snoop/invalidate path
10 * Add multi-hit error detection
11 * Pipelined bus interface (wb or axi)
12 * Maybe add parity? There's a few bits free in each BRAM row on Xilinx
13 * Add optimization: service hits on partially loaded lines
14 * Add optimization: (maybe) interrupt reload on flush/redirect
15 * Check if playing with the geometry of the cache tags allows for more
16 efficient use of distributed RAM and less logic/muxes. Currently we
17 write TAG_BITS width which may not match full ram blocks and might
18 cause muxes to be inferred for "partial writes".
19 * Check if making the read size of PLRU a ROM helps utilization
20
21 """
22 from enum import Enum, unique
23 from nmigen import (Module, Signal, Elaboratable, Cat, Array, Const, Repl)
24 from nmigen.cli import main, rtlil
25 from nmutil.iocontrol import RecordObject
26 from nmigen.utils import log2_int
27 from nmutil.util import Display
28
29 #from nmutil.plru import PLRU
30 from soc.experiment.cache_ram import CacheRam
31 from soc.experiment.plru import PLRU
32
33 from soc.experiment.mem_types import (Fetch1ToICacheType,
34 ICacheToDecode1Type,
35 MMUToICacheType)
36
37 from soc.experiment.wb_types import (WB_ADDR_BITS, WB_DATA_BITS,
38 WB_SEL_BITS, WBAddrType, WBDataType,
39 WBSelType, WBMasterOut, WBSlaveOut,
40 WBMasterOutVector, WBSlaveOutVector,
41 WBIOMasterOut, WBIOSlaveOut)
42
43 # for test
44 from nmigen_soc.wishbone.sram import SRAM
45 from nmigen import Memory
46 from nmutil.util import wrap
47 from nmigen.cli import main, rtlil
48 if True:
49 from nmigen.back.pysim import Simulator, Delay, Settle
50 else:
51 from nmigen.sim.cxxsim import Simulator, Delay, Settle
52
53
54 SIM = 0
55 LINE_SIZE = 64
56 # BRAM organisation: We never access more than wishbone_data_bits
57 # at a time so to save resources we make the array only that wide,
58 # and use consecutive indices for to make a cache "line"
59 #
60 # ROW_SIZE is the width in bytes of the BRAM (based on WB, so 64-bits)
61 ROW_SIZE = WB_DATA_BITS // 8
62 # Number of lines in a set
63 NUM_LINES = 16
64 # Number of ways
65 NUM_WAYS = 4
66 # L1 ITLB number of entries (direct mapped)
67 TLB_SIZE = 64
68 # L1 ITLB log_2(page_size)
69 TLB_LG_PGSZ = 12
70 # Number of real address bits that we store
71 REAL_ADDR_BITS = 56
72 # Non-zero to enable log data collection
73 LOG_LENGTH = 0
74
75 ROW_SIZE_BITS = ROW_SIZE * 8
76 # ROW_PER_LINE is the number of row
77 # (wishbone) transactions in a line
78 ROW_PER_LINE = LINE_SIZE // ROW_SIZE
79 # BRAM_ROWS is the number of rows in
80 # BRAM needed to represent the full icache
81 BRAM_ROWS = NUM_LINES * ROW_PER_LINE
82 # INSN_PER_ROW is the number of 32bit
83 # instructions per BRAM row
84 INSN_PER_ROW = ROW_SIZE_BITS // 32
85
86 print("ROW_SIZE", ROW_SIZE)
87 print("ROW_SIZE_BITS", ROW_SIZE_BITS)
88 print("ROW_PER_LINE", ROW_PER_LINE)
89 print("BRAM_ROWS", BRAM_ROWS)
90 print("INSN_PER_ROW", INSN_PER_ROW)
91
92 # Bit fields counts in the address
93 #
94 # INSN_BITS is the number of bits to
95 # select an instruction in a row
96 INSN_BITS = log2_int(INSN_PER_ROW)
97 # ROW_BITS is the number of bits to
98 # select a row
99 ROW_BITS = log2_int(BRAM_ROWS)
100 # ROW_LINEBITS is the number of bits to
101 # select a row within a line
102 ROW_LINEBITS = log2_int(ROW_PER_LINE)
103 # LINE_OFF_BITS is the number of bits for
104 # the offset in a cache line
105 LINE_OFF_BITS = log2_int(LINE_SIZE)
106 # ROW_OFF_BITS is the number of bits for
107 # the offset in a row
108 ROW_OFF_BITS = log2_int(ROW_SIZE)
109 # INDEX_BITS is the number of bits to
110 # select a cache line
111 INDEX_BITS = log2_int(NUM_LINES)
112 # SET_SIZE_BITS is the log base 2 of
113 # the set size
114 SET_SIZE_BITS = LINE_OFF_BITS + INDEX_BITS
115 # TAG_BITS is the number of bits of
116 # the tag part of the address
117 TAG_BITS = REAL_ADDR_BITS - SET_SIZE_BITS
118 # TAG_WIDTH is the width in bits of each way of the tag RAM
119 TAG_WIDTH = TAG_BITS + 7 - ((TAG_BITS + 7) % 8)
120
121 # WAY_BITS is the number of bits to
122 # select a way
123 WAY_BITS = log2_int(NUM_WAYS)
124 TAG_RAM_WIDTH = TAG_BITS * NUM_WAYS
125
126 # -- L1 ITLB.
127 # constant TLB_BITS : natural := log2(TLB_SIZE);
128 # constant TLB_EA_TAG_BITS : natural := 64 - (TLB_LG_PGSZ + TLB_BITS);
129 # constant TLB_PTE_BITS : natural := 64;
130 TLB_BITS = log2_int(TLB_SIZE)
131 TLB_EA_TAG_BITS = 64 - (TLB_LG_PGSZ + TLB_BITS)
132 TLB_PTE_BITS = 64
133
134
135 print("INSN_BITS", INSN_BITS)
136 print("ROW_BITS", ROW_BITS)
137 print("ROW_LINEBITS", ROW_LINEBITS)
138 print("LINE_OFF_BITS", LINE_OFF_BITS)
139 print("ROW_OFF_BITS", ROW_OFF_BITS)
140 print("INDEX_BITS", INDEX_BITS)
141 print("SET_SIZE_BITS", SET_SIZE_BITS)
142 print("TAG_BITS", TAG_BITS)
143 print("WAY_BITS", WAY_BITS)
144 print("TAG_RAM_WIDTH", TAG_RAM_WIDTH)
145 print("TLB_BITS", TLB_BITS)
146 print("TLB_EA_TAG_BITS", TLB_EA_TAG_BITS)
147 print("TLB_PTE_BITS", TLB_PTE_BITS)
148
149
150
151
152 # architecture rtl of icache is
153 #constant ROW_SIZE_BITS : natural := ROW_SIZE*8;
154 #-- ROW_PER_LINE is the number of row (wishbone
155 #-- transactions) in a line
156 #constant ROW_PER_LINE : natural := LINE_SIZE / ROW_SIZE;
157 #-- BRAM_ROWS is the number of rows in BRAM
158 #-- needed to represent the full
159 #-- icache
160 #constant BRAM_ROWS : natural := NUM_LINES * ROW_PER_LINE;
161 #-- INSN_PER_ROW is the number of 32bit instructions per BRAM row
162 #constant INSN_PER_ROW : natural := ROW_SIZE_BITS / 32;
163 #-- Bit fields counts in the address
164 #
165 #-- INSN_BITS is the number of bits to select
166 #-- an instruction in a row
167 #constant INSN_BITS : natural := log2(INSN_PER_ROW);
168 #-- ROW_BITS is the number of bits to select a row
169 #constant ROW_BITS : natural := log2(BRAM_ROWS);
170 #-- ROW_LINEBITS is the number of bits to
171 #-- select a row within a line
172 #constant ROW_LINEBITS : natural := log2(ROW_PER_LINE);
173 #-- LINE_OFF_BITS is the number of bits for the offset
174 #-- in a cache line
175 #constant LINE_OFF_BITS : natural := log2(LINE_SIZE);
176 #-- ROW_OFF_BITS is the number of bits for the offset in a row
177 #constant ROW_OFF_BITS : natural := log2(ROW_SIZE);
178 #-- INDEX_BITS is the number of bits to select a cache line
179 #constant INDEX_BITS : natural := log2(NUM_LINES);
180 #-- SET_SIZE_BITS is the log base 2 of the set size
181 #constant SET_SIZE_BITS : natural := LINE_OFF_BITS + INDEX_BITS;
182 #-- TAG_BITS is the number of bits of the tag part of the address
183 #constant TAG_BITS : natural := REAL_ADDR_BITS - SET_SIZE_BITS;
184 #-- WAY_BITS is the number of bits to select a way
185 #constant WAY_BITS : natural := log2(NUM_WAYS);
186
187 #-- Example of layout for 32 lines of 64 bytes:
188 #--
189 #-- .. tag |index| line |
190 #-- .. | row | |
191 #-- .. | | | |00| zero (2)
192 #-- .. | | |-| | INSN_BITS (1)
193 #-- .. | |---| | ROW_LINEBITS (3)
194 #-- .. | |--- - --| LINE_OFF_BITS (6)
195 #-- .. | |- --| ROW_OFF_BITS (3)
196 #-- .. |----- ---| | ROW_BITS (8)
197 #-- .. |-----| | INDEX_BITS (5)
198 #-- .. --------| | TAG_BITS (53)
199 # Example of layout for 32 lines of 64 bytes:
200 #
201 # .. tag |index| line |
202 # .. | row | |
203 # .. | | | |00| zero (2)
204 # .. | | |-| | INSN_BITS (1)
205 # .. | |---| | ROW_LINEBITS (3)
206 # .. | |--- - --| LINE_OFF_BITS (6)
207 # .. | |- --| ROW_OFF_BITS (3)
208 # .. |----- ---| | ROW_BITS (8)
209 # .. |-----| | INDEX_BITS (5)
210 # .. --------| | TAG_BITS (53)
211
212 #subtype row_t is integer range 0 to BRAM_ROWS-1;
213 #subtype index_t is integer range 0 to NUM_LINES-1;
214 #subtype way_t is integer range 0 to NUM_WAYS-1;
215 #subtype row_in_line_t is unsigned(ROW_LINEBITS-1 downto 0);
216 #
217 #-- The cache data BRAM organized as described above for each way
218 #subtype cache_row_t is std_ulogic_vector(ROW_SIZE_BITS-1 downto 0);
219 #
220 #-- The cache tags LUTRAM has a row per set. Vivado is a pain and will
221 #-- not handle a clean (commented) definition of the cache tags as a 3d
222 #-- memory. For now, work around it by putting all the tags
223 #subtype cache_tag_t is std_logic_vector(TAG_BITS-1 downto 0);
224 # type cache_tags_set_t is array(way_t) of cache_tag_t;
225 # type cache_tags_array_t is array(index_t) of cache_tags_set_t;
226 #constant TAG_RAM_WIDTH : natural := TAG_BITS * NUM_WAYS;
227 #subtype cache_tags_set_t is std_logic_vector(TAG_RAM_WIDTH-1 downto 0);
228 #type cache_tags_array_t is array(index_t) of cache_tags_set_t;
229 def CacheTagArray():
230 return Array(Signal(TAG_RAM_WIDTH, name="cachetag_%d" %x) \
231 for x in range(NUM_LINES))
232
233 #-- The cache valid bits
234 #subtype cache_way_valids_t is std_ulogic_vector(NUM_WAYS-1 downto 0);
235 #type cache_valids_t is array(index_t) of cache_way_valids_t;
236 #type row_per_line_valid_t is array(0 to ROW_PER_LINE - 1) of std_ulogic;
237 def CacheValidBitsArray():
238 return Array(Signal(NUM_WAYS, name="cachevalid_%d" %x) \
239 for x in range(NUM_LINES))
240
241 def RowPerLineValidArray():
242 return Array(Signal(name="rows_valid_%d" %x) \
243 for x in range(ROW_PER_LINE))
244
245
246 #attribute ram_style : string;
247 #attribute ram_style of cache_tags : signal is "distributed";
248 # TODO to be passed to nigmen as ram attributes
249 # attribute ram_style : string;
250 # attribute ram_style of cache_tags : signal is "distributed";
251
252
253 #subtype tlb_index_t is integer range 0 to TLB_SIZE - 1;
254 #type tlb_valids_t is array(tlb_index_t) of std_ulogic;
255 #subtype tlb_tag_t is std_ulogic_vector(TLB_EA_TAG_BITS - 1 downto 0);
256 #type tlb_tags_t is array(tlb_index_t) of tlb_tag_t;
257 #subtype tlb_pte_t is std_ulogic_vector(TLB_PTE_BITS - 1 downto 0);
258 #type tlb_ptes_t is array(tlb_index_t) of tlb_pte_t;
259 def TLBValidBitsArray():
260 return Array(Signal(name="tlbvalid_%d" %x) \
261 for x in range(TLB_SIZE))
262
263 def TLBTagArray():
264 return Array(Signal(TLB_EA_TAG_BITS, name="tlbtag_%d" %x) \
265 for x in range(TLB_SIZE))
266
267 def TLBPtesArray():
268 return Array(Signal(TLB_PTE_BITS, name="tlbptes_%d" %x) \
269 for x in range(TLB_SIZE))
270
271
272 #-- Cache RAM interface
273 #type cache_ram_out_t is array(way_t) of cache_row_t;
274 # Cache RAM interface
275 def CacheRamOut():
276 return Array(Signal(ROW_SIZE_BITS, name="cache_out_%d" %x) \
277 for x in range(NUM_WAYS))
278
279 #-- PLRU output interface
280 #type plru_out_t is array(index_t) of
281 # std_ulogic_vector(WAY_BITS-1 downto 0);
282 # PLRU output interface
283 def PLRUOut():
284 return Array(Signal(WAY_BITS, name="plru_out_%d" %x) \
285 for x in range(NUM_LINES))
286
287 # -- Return the cache line index (tag index) for an address
288 # function get_index(addr: std_ulogic_vector(63 downto 0))
289 # return index_t is
290 # begin
291 # return to_integer(unsigned(
292 # addr(SET_SIZE_BITS - 1 downto LINE_OFF_BITS)
293 # ));
294 # end;
295 # Return the cache line index (tag index) for an address
296 def get_index(addr):
297 return addr[LINE_OFF_BITS:SET_SIZE_BITS]
298
299 # -- Return the cache row index (data memory) for an address
300 # function get_row(addr: std_ulogic_vector(63 downto 0))
301 # return row_t is
302 # begin
303 # return to_integer(unsigned(
304 # addr(SET_SIZE_BITS - 1 downto ROW_OFF_BITS)
305 # ));
306 # end;
307 # Return the cache row index (data memory) for an address
308 def get_row(addr):
309 return addr[ROW_OFF_BITS:SET_SIZE_BITS]
310
311 # -- Return the index of a row within a line
312 # function get_row_of_line(row: row_t) return row_in_line_t is
313 # variable row_v : unsigned(ROW_BITS-1 downto 0);
314 # begin
315 # row_v := to_unsigned(row, ROW_BITS);
316 # return row_v(ROW_LINEBITS-1 downto 0);
317 # end;
318 # Return the index of a row within a line
319 def get_row_of_line(row):
320 return row[:ROW_LINEBITS]
321
322 # -- Returns whether this is the last row of a line
323 # function is_last_row_addr(addr: wishbone_addr_type;
324 # last: row_in_line_t
325 # )
326 # return boolean is
327 # begin
328 # return unsigned(
329 # addr(LINE_OFF_BITS-1 downto ROW_OFF_BITS)
330 # ) = last;
331 # end;
332 # Returns whether this is the last row of a line
333 def is_last_row_addr(addr, last):
334 return addr[ROW_OFF_BITS:LINE_OFF_BITS] == last
335
336 # -- Returns whether this is the last row of a line
337 # function is_last_row(row: row_t;
338 # last: row_in_line_t) return boolean is
339 # begin
340 # return get_row_of_line(row) = last;
341 # end;
342 # Returns whether this is the last row of a line
343 def is_last_row(row, last):
344 return get_row_of_line(row) == last
345
346 # -- Return the next row in the current cache line. We use a dedicated
347 # -- function in order to limit the size of the generated adder to be
348 # -- only the bits within a cache line (3 bits with default settings)
349 # function next_row(row: row_t) return row_t is
350 # variable row_v : std_ulogic_vector(ROW_BITS-1 downto 0);
351 # variable row_idx : std_ulogic_vector(ROW_LINEBITS-1 downto 0);
352 # variable result : std_ulogic_vector(ROW_BITS-1 downto 0);
353 # begin
354 # row_v := std_ulogic_vector(to_unsigned(row, ROW_BITS));
355 # row_idx := row_v(ROW_LINEBITS-1 downto 0);
356 # row_v(ROW_LINEBITS-1 downto 0) :=
357 # std_ulogic_vector(unsigned(row_idx) + 1);
358 # return to_integer(unsigned(row_v));
359 # end;
360 # Return the next row in the current cache line. We use a dedicated
361 # function in order to limit the size of the generated adder to be
362 # only the bits within a cache line (3 bits with default settings)
363 def next_row(row):
364 row_v = row[0:ROW_LINEBITS] + 1
365 return Cat(row_v[:ROW_LINEBITS], row[ROW_LINEBITS:])
366 # -- Read the instruction word for the given address in the
367 # -- current cache row
368 # function read_insn_word(addr: std_ulogic_vector(63 downto 0);
369 # data: cache_row_t) return std_ulogic_vector is
370 # variable word: integer range 0 to INSN_PER_ROW-1;
371 # begin
372 # word := to_integer(unsigned(addr(INSN_BITS+2-1 downto 2)));
373 # return data(31+word*32 downto word*32);
374 # end;
375 # Read the instruction word for the given address
376 # in the current cache row
377 def read_insn_word(addr, data):
378 word = addr[2:INSN_BITS+2]
379 return data.word_select(word, 32)
380
381 # -- Get the tag value from the address
382 # function get_tag(
383 # addr: std_ulogic_vector(REAL_ADDR_BITS - 1 downto 0)
384 # )
385 # return cache_tag_t is
386 # begin
387 # return addr(REAL_ADDR_BITS - 1 downto SET_SIZE_BITS);
388 # end;
389 # Get the tag value from the address
390 def get_tag(addr):
391 return addr[SET_SIZE_BITS:REAL_ADDR_BITS]
392
393 # -- Read a tag from a tag memory row
394 # function read_tag(way: way_t; tagset: cache_tags_set_t)
395 # return cache_tag_t is
396 # begin
397 # return tagset((way+1) * TAG_BITS - 1 downto way * TAG_BITS);
398 # end;
399 # Read a tag from a tag memory row
400 def read_tag(way, tagset):
401 return tagset.word_select(way, TAG_BITS)
402
403 # -- Write a tag to tag memory row
404 # procedure write_tag(way: in way_t;
405 # tagset: inout cache_tags_set_t; tag: cache_tag_t) is
406 # begin
407 # tagset((way+1) * TAG_BITS - 1 downto way * TAG_BITS) := tag;
408 # end;
409 # Write a tag to tag memory row
410 def write_tag(way, tagset, tag):
411 return read_tag(way, tagset).eq(tag)
412
413 # -- Simple hash for direct-mapped TLB index
414 # function hash_ea(addr: std_ulogic_vector(63 downto 0))
415 # return tlb_index_t is
416 # variable hash : std_ulogic_vector(TLB_BITS - 1 downto 0);
417 # begin
418 # hash := addr(TLB_LG_PGSZ + TLB_BITS - 1 downto TLB_LG_PGSZ)
419 # xor addr(
420 # TLB_LG_PGSZ + 2 * TLB_BITS - 1 downto
421 # TLB_LG_PGSZ + TLB_BITS
422 # )
423 # xor addr(
424 # TLB_LG_PGSZ + 3 * TLB_BITS - 1 downto
425 # TLB_LG_PGSZ + 2 * TLB_BITS
426 # );
427 # return to_integer(unsigned(hash));
428 # end;
429 # Simple hash for direct-mapped TLB index
430 def hash_ea(addr):
431 hsh = addr[TLB_LG_PGSZ:TLB_LG_PGSZ + TLB_BITS] ^ addr[
432 TLB_LG_PGSZ + TLB_BITS:TLB_LG_PGSZ + 2 * TLB_BITS
433 ] ^ addr[
434 TLB_LG_PGSZ + 2 * TLB_BITS:TLB_LG_PGSZ + 3 * TLB_BITS
435 ]
436 return hsh
437
438 # begin
439 #
440 # XXX put these assert statements in - as python asserts
441 #
442 # assert LINE_SIZE mod ROW_SIZE = 0;
443 # assert ispow2(LINE_SIZE) report "LINE_SIZE not power of 2"
444 # assert ispow2(NUM_LINES) report "NUM_LINES not power of 2"
445 # assert ispow2(ROW_PER_LINE) report "ROW_PER_LINE not power of 2"
446 # assert ispow2(INSN_PER_ROW) report "INSN_PER_ROW not power of 2"
447 # assert (ROW_BITS = INDEX_BITS + ROW_LINEBITS)
448 # report "geometry bits don't add up"
449 # assert (LINE_OFF_BITS = ROW_OFF_BITS + ROW_LINEBITS)
450 # report "geometry bits don't add up"
451 # assert (REAL_ADDR_BITS = TAG_BITS + INDEX_BITS + LINE_OFF_BITS)
452 # report "geometry bits don't add up"
453 # assert (REAL_ADDR_BITS = TAG_BITS + ROW_BITS + ROW_OFF_BITS)
454 # report "geometry bits don't add up"
455 #
456 # sim_debug: if SIM generate
457 # debug: process
458 # begin
459 # report "ROW_SIZE = " & natural'image(ROW_SIZE);
460 # report "ROW_PER_LINE = " & natural'image(ROW_PER_LINE);
461 # report "BRAM_ROWS = " & natural'image(BRAM_ROWS);
462 # report "INSN_PER_ROW = " & natural'image(INSN_PER_ROW);
463 # report "INSN_BITS = " & natural'image(INSN_BITS);
464 # report "ROW_BITS = " & natural'image(ROW_BITS);
465 # report "ROW_LINEBITS = " & natural'image(ROW_LINEBITS);
466 # report "LINE_OFF_BITS = " & natural'image(LINE_OFF_BITS);
467 # report "ROW_OFF_BITS = " & natural'image(ROW_OFF_BITS);
468 # report "INDEX_BITS = " & natural'image(INDEX_BITS);
469 # report "TAG_BITS = " & natural'image(TAG_BITS);
470 # report "WAY_BITS = " & natural'image(WAY_BITS);
471 # wait;
472 # end process;
473 # end generate;
474
475 # Cache reload state machine
476 @unique
477 class State(Enum):
478 IDLE = 0
479 CLR_TAG = 1
480 WAIT_ACK = 2
481
482
483 class RegInternal(RecordObject):
484 def __init__(self):
485 super().__init__()
486 # Cache hit state (Latches for 1 cycle BRAM access)
487 self.hit_way = Signal(NUM_WAYS)
488 self.hit_nia = Signal(64)
489 self.hit_smark = Signal()
490 self.hit_valid = Signal()
491
492 # Cache miss state (reload state machine)
493 self.state = Signal(State, reset=State.IDLE)
494 self.wb = WBMasterOut("wb")
495 self.req_adr = Signal(64)
496 self.store_way = Signal(NUM_WAYS)
497 self.store_index = Signal(NUM_LINES)
498 self.store_row = Signal(BRAM_ROWS)
499 self.store_tag = Signal(TAG_BITS)
500 self.store_valid = Signal()
501 self.end_row_ix = Signal(ROW_LINEBITS)
502 self.rows_valid = RowPerLineValidArray()
503
504 # TLB miss state
505 self.fetch_failed = Signal()
506
507 # -- 64 bit direct mapped icache. All instructions are 4B aligned.
508 #
509 # entity icache is
510 # generic (
511 # SIM : boolean := false;
512 # -- Line size in bytes
513 # LINE_SIZE : positive := 64;
514 # -- BRAM organisation: We never access more
515 # -- than wishbone_data_bits
516 # -- at a time so to save resources we make the
517 # -- array only that wide,
518 # -- and use consecutive indices for to make a cache "line"
519 # --
520 # -- ROW_SIZE is the width in bytes of the BRAM (based on WB,
521 # -- so 64-bits)
522 # ROW_SIZE : positive := wishbone_data_bits / 8;
523 # -- Number of lines in a set
524 # NUM_LINES : positive := 32;
525 # -- Number of ways
526 # NUM_WAYS : positive := 4;
527 # -- L1 ITLB number of entries (direct mapped)
528 # TLB_SIZE : positive := 64;
529 # -- L1 ITLB log_2(page_size)
530 # TLB_LG_PGSZ : positive := 12;
531 # -- Number of real address bits that we store
532 # REAL_ADDR_BITS : positive := 56;
533 # -- Non-zero to enable log data collection
534 # LOG_LENGTH : natural := 0
535 # );
536 # port (
537 # clk : in std_ulogic;
538 # rst : in std_ulogic;
539 #
540 # i_in : in Fetch1ToIcacheType;
541 # i_out : out IcacheToDecode1Type;
542 #
543 # m_in : in MmuToIcacheType;
544 #
545 # stall_in : in std_ulogic;
546 # stall_out : out std_ulogic;
547 # flush_in : in std_ulogic;
548 # inval_in : in std_ulogic;
549 #
550 # wishbone_out : out wishbone_master_out;
551 # wishbone_in : in wishbone_slave_out;
552 #
553 # log_out : out std_ulogic_vector(53 downto 0)
554 # );
555 # end entity icache;
556 # 64 bit direct mapped icache. All instructions are 4B aligned.
557 class ICache(Elaboratable):
558 """64 bit direct mapped icache. All instructions are 4B aligned."""
559 def __init__(self):
560 self.i_in = Fetch1ToICacheType(name="i_in")
561 self.i_out = ICacheToDecode1Type(name="i_out")
562
563 self.m_in = MMUToICacheType(name="m_in")
564
565 self.stall_in = Signal()
566 self.stall_out = Signal()
567 self.flush_in = Signal()
568 self.inval_in = Signal()
569
570 self.wb_out = WBMasterOut(name="wb_out")
571 self.wb_in = WBSlaveOut(name="wb_in")
572
573 self.log_out = Signal(54)
574
575
576 # Generate a cache RAM for each way
577 def rams(self, m, r, cache_out_row, use_previous, replace_way, req_row):
578 comb = m.d.comb
579 sync = m.d.sync
580
581 wb_in, stall_in = self.wb_in, self.stall_in
582
583 for i in range(NUM_WAYS):
584 do_read = Signal(name="do_rd_%d" % i)
585 do_write = Signal(name="do_wr_%d" % i)
586 rd_addr = Signal(ROW_BITS)
587 wr_addr = Signal(ROW_BITS)
588 d_out = Signal(ROW_SIZE_BITS, name="d_out_%d" % i)
589 wr_sel = Signal(ROW_SIZE)
590
591 way = CacheRam(ROW_BITS, ROW_SIZE_BITS, True)
592 setattr(m.submodules, "cacheram_%d" % i, way)
593
594 comb += way.rd_en.eq(do_read)
595 comb += way.rd_addr.eq(rd_addr)
596 comb += d_out.eq(way.rd_data_o)
597 comb += way.wr_sel.eq(wr_sel)
598 comb += way.wr_addr.eq(wr_addr)
599 comb += way.wr_data.eq(wb_in.dat)
600
601 comb += do_read.eq(~(stall_in | use_previous))
602 comb += do_write.eq(wb_in.ack & (replace_way == i))
603
604 with m.If(do_write):
605 sync += Display("cache write adr: %x data: %lx",
606 wr_addr, way.wr_data)
607
608 with m.If(r.hit_way == i):
609 comb += cache_out_row.eq(d_out)
610 with m.If(do_read):
611 sync += Display("cache read adr: %x data: %x",
612 req_row, d_out)
613
614 comb += rd_addr.eq(req_row)
615 comb += wr_addr.eq(r.store_row)
616 comb += wr_sel.eq(Repl(do_write, ROW_SIZE))
617
618 # -- Generate PLRUs
619 def maybe_plrus(self, m, r, plru_victim):
620 comb = m.d.comb
621
622 with m.If(NUM_WAYS > 1):
623 for i in range(NUM_LINES):
624 plru_acc_i = Signal(WAY_BITS)
625 plru_acc_en = Signal()
626 plru = PLRU(WAY_BITS)
627 setattr(m.submodules, "plru_%d" % i, plru)
628
629 comb += plru.acc_i.eq(plru_acc_i)
630 comb += plru.acc_en.eq(plru_acc_en)
631
632 # PLRU interface
633 with m.If(get_index(r.hit_nia) == i):
634 comb += plru.acc_en.eq(r.hit_valid)
635
636 comb += plru.acc_i.eq(r.hit_way)
637 comb += plru_victim[i].eq(plru.lru_o)
638
639 # TLB hit detection and real address generation
640 def itlb_lookup(self, m, tlb_req_index, itlb_ptes, itlb_tags,
641 real_addr, itlb_valid_bits, ra_valid, eaa_priv,
642 priv_fault, access_ok):
643 comb = m.d.comb
644
645 i_in = self.i_in
646
647 pte = Signal(TLB_PTE_BITS)
648 ttag = Signal(TLB_EA_TAG_BITS)
649
650 comb += tlb_req_index.eq(hash_ea(i_in.nia))
651 comb += pte.eq(itlb_ptes[tlb_req_index])
652 comb += ttag.eq(itlb_tags[tlb_req_index])
653
654 with m.If(i_in.virt_mode):
655 comb += real_addr.eq(Cat(
656 i_in.nia[:TLB_LG_PGSZ],
657 pte[TLB_LG_PGSZ:REAL_ADDR_BITS]
658 ))
659
660 with m.If(ttag == i_in.nia[TLB_LG_PGSZ + TLB_BITS:64]):
661 comb += ra_valid.eq(itlb_valid_bits[tlb_req_index])
662
663 comb += eaa_priv.eq(pte[3])
664
665 with m.Else():
666 comb += real_addr.eq(i_in.nia[:REAL_ADDR_BITS])
667 comb += ra_valid.eq(1)
668 comb += eaa_priv.eq(1)
669
670 # No IAMR, so no KUEP support for now
671 comb += priv_fault.eq(eaa_priv & ~i_in.priv_mode)
672 comb += access_ok.eq(ra_valid & ~priv_fault)
673
674 # iTLB update
675 def itlb_update(self, m, itlb_valid_bits, itlb_tags, itlb_ptes):
676 comb = m.d.comb
677 sync = m.d.sync
678
679 m_in = self.m_in
680
681 wr_index = Signal(TLB_SIZE)
682 comb += wr_index.eq(hash_ea(m_in.addr))
683
684 with m.If(m_in.tlbie & m_in.doall):
685 # Clear all valid bits
686 for i in range(TLB_SIZE):
687 sync += itlb_valid_bits[i].eq(0)
688
689 with m.Elif(m_in.tlbie):
690 # Clear entry regardless of hit or miss
691 sync += itlb_valid_bits[wr_index].eq(0)
692
693 with m.Elif(m_in.tlbld):
694 sync += itlb_tags[wr_index].eq(
695 m_in.addr[TLB_LG_PGSZ + TLB_BITS:64]
696 )
697 sync += itlb_ptes[wr_index].eq(m_in.pte)
698 sync += itlb_valid_bits[wr_index].eq(1)
699
700 # Cache hit detection, output to fetch2 and other misc logic
701 def icache_comb(self, m, use_previous, r, req_index, req_row, req_hit_way,
702 req_tag, real_addr, req_laddr, cache_valid_bits,
703 cache_tags, access_ok, req_is_hit,
704 req_is_miss, replace_way, plru_victim, cache_out_row):
705 comb = m.d.comb
706
707 #comb += Display("ENTER icache_comb - use_previous:%x req_index:%x "
708 # "req_row:%x req_tag:%x real_addr:%x req_laddr:%x "
709 # "access_ok:%x req_is_hit:%x req_is_miss:%x "
710 # "replace_way:%x", use_previous, req_index, req_row,
711 # req_tag, real_addr, req_laddr, access_ok,
712 # req_is_hit, req_is_miss, replace_way)
713
714 i_in, i_out, wb_out = self.i_in, self.i_out, self.wb_out
715 flush_in, stall_out = self.flush_in, self.stall_out
716
717 is_hit = Signal()
718 hit_way = Signal(NUM_WAYS)
719
720 # i_in.sequential means that i_in.nia this cycle is 4 more than
721 # last cycle. If we read more than 32 bits at a time, had a
722 # cache hit last cycle, and we don't want the first 32-bit chunk
723 # then we can keep the data we read last cycle and just use that.
724 with m.If(i_in.nia[2:INSN_BITS+2] != 0):
725 comb += use_previous.eq(i_in.sequential & r.hit_valid)
726
727 # Extract line, row and tag from request
728 comb += req_index.eq(get_index(i_in.nia))
729 comb += req_row.eq(get_row(i_in.nia))
730 comb += req_tag.eq(get_tag(real_addr))
731
732 # Calculate address of beginning of cache row, will be
733 # used for cache miss processing if needed
734 comb += req_laddr.eq(Cat(
735 Const(0, ROW_OFF_BITS),
736 real_addr[ROW_OFF_BITS:REAL_ADDR_BITS],
737 ))
738
739 # Test if pending request is a hit on any way
740 hitcond = Signal()
741 comb += hitcond.eq((r.state == State.WAIT_ACK)
742 & (req_index == r.store_index)
743 & r.rows_valid[req_row % ROW_PER_LINE])
744 with m.If(i_in.req):
745 cvb = Signal(NUM_WAYS)
746 ctag = Signal(TAG_RAM_WIDTH)
747 comb += ctag.eq(cache_tags[req_index])
748 comb += cvb.eq(cache_valid_bits[req_index])
749 for i in range(NUM_WAYS):
750 tagi = Signal(TAG_BITS, name="ti%d" % i)
751 comb += tagi.eq(read_tag(i, ctag))
752 hit_test = Signal(name="hit_test%d" % i)
753 comb += hit_test.eq(i == r.store_way)
754 with m.If((cvb[i] | (hitcond & hit_test)) & (tagi == req_tag)):
755 comb += hit_way.eq(i)
756 comb += is_hit.eq(1)
757
758 # Generate the "hit" and "miss" signals
759 # for the synchronous blocks
760 with m.If(i_in.req & access_ok & ~flush_in):
761 comb += req_is_hit.eq(is_hit)
762 comb += req_is_miss.eq(~is_hit)
763
764 with m.Else():
765 comb += req_is_hit.eq(0)
766 comb += req_is_miss.eq(0)
767
768 comb += req_hit_way.eq(hit_way)
769
770 # The way to replace on a miss
771 with m.If(r.state == State.CLR_TAG):
772 comb += replace_way.eq(plru_victim[r.store_index])
773 with m.Else():
774 comb += replace_way.eq(r.store_way)
775
776 # Output instruction from current cache row
777 #
778 # Note: This is a mild violation of our design principle of
779 # having pipeline stages output from a clean latch. In this
780 # case we output the result of a mux. The alternative would
781 # be output an entire row which I prefer not to do just yet
782 # as it would force fetch2 to know about some of the cache
783 # geometry information.
784 #comb += Display("BEFORE read_insn_word - r.hit_nia:%x " \
785 # "r.hit_way:%x, cache_out[r.hit_way]:%x", r.hit_nia, \
786 # r.hit_way, cache_out[r.hit_way])
787 comb += i_out.insn.eq(read_insn_word(r.hit_nia, cache_out_row))
788 comb += i_out.valid.eq(r.hit_valid)
789 comb += i_out.nia.eq(r.hit_nia)
790 comb += i_out.stop_mark.eq(r.hit_smark)
791 comb += i_out.fetch_failed.eq(r.fetch_failed)
792
793 # Stall fetch1 if we have a miss on cache or TLB
794 # or a protection fault
795 comb += stall_out.eq(~(is_hit & access_ok))
796
797 # Wishbone requests output (from the cache miss reload machine)
798 comb += wb_out.eq(r.wb)
799
800 # Cache hit synchronous machine
801 def icache_hit(self, m, use_previous, r, req_is_hit, req_hit_way,
802 req_index, req_tag, real_addr):
803 sync = m.d.sync
804
805 i_in, stall_in = self.i_in, self.stall_in
806 flush_in = self.flush_in
807
808 # keep outputs to fetch2 unchanged on a stall
809 # except that flush or reset sets valid to 0
810 # If use_previous, keep the same data as last
811 # cycle and use the second half
812 with m.If(stall_in | use_previous):
813 with m.If(flush_in):
814 sync += r.hit_valid.eq(0)
815 with m.Else():
816 # On a hit, latch the request for the next cycle,
817 # when the BRAM data will be available on the
818 # cache_out output of the corresponding way
819 sync += r.hit_valid.eq(req_is_hit)
820
821 with m.If(req_is_hit):
822 sync += r.hit_way.eq(req_hit_way)
823 sync += Display("cache hit nia:%x IR:%x SM:%x idx:%x " \
824 "tag:%x way:%x RA:%x", i_in.nia, \
825 i_in.virt_mode, i_in.stop_mark, req_index, \
826 req_tag, req_hit_way, real_addr)
827
828
829
830 with m.If(~stall_in):
831 # Send stop marks and NIA down regardless of validity
832 sync += r.hit_smark.eq(i_in.stop_mark)
833 sync += r.hit_nia.eq(i_in.nia)
834
835 def icache_miss_idle(self, m, r, req_is_miss, req_laddr,
836 req_index, req_tag, replace_way, real_addr):
837 comb = m.d.comb
838 sync = m.d.sync
839
840 i_in = self.i_in
841
842 # Reset per-row valid flags,
843 # only used in WAIT_ACK
844 for i in range(ROW_PER_LINE):
845 sync += r.rows_valid[i].eq(0)
846
847 # We need to read a cache line
848 with m.If(req_is_miss):
849 sync += Display(
850 "cache miss nia:%x IR:%x SM:%x idx:%x "
851 " way:%x tag:%x RA:%x", i_in.nia,
852 i_in.virt_mode, i_in.stop_mark, req_index,
853 replace_way, req_tag, real_addr
854 )
855
856 # Keep track of our index and way
857 # for subsequent stores
858 st_row = Signal(BRAM_ROWS)
859 comb += st_row.eq(get_row(req_laddr))
860 sync += r.store_index.eq(req_index)
861 sync += r.store_row.eq(st_row)
862 sync += r.store_tag.eq(req_tag)
863 sync += r.store_valid.eq(1)
864 sync += r.end_row_ix.eq(get_row_of_line(st_row) - 1)
865
866 # Prep for first wishbone read. We calculate the
867 # address of the start of the cache line and
868 # start the WB cycle.
869 sync += r.req_adr.eq(req_laddr)
870 sync += r.wb.cyc.eq(1)
871 sync += r.wb.stb.eq(1)
872
873 # Track that we had one request sent
874 sync += r.state.eq(State.CLR_TAG)
875
876 def icache_miss_clr_tag(self, m, r, replace_way,
877 cache_valid_bits, req_index,
878 tagset, cache_tags):
879
880 comb = m.d.comb
881 sync = m.d.sync
882
883 # Get victim way from plru
884 sync += r.store_way.eq(replace_way)
885 # Force misses on that way while reloading that line
886 cv = Signal(INDEX_BITS)
887 comb += cv.eq(cache_valid_bits[req_index])
888 comb += cv.bit_select(replace_way, 1).eq(0)
889 sync += cache_valid_bits[req_index].eq(cv)
890
891 for i in range(NUM_WAYS):
892 with m.If(i == replace_way):
893 comb += tagset.eq(cache_tags[r.store_index])
894 comb += write_tag(i, tagset, r.store_tag)
895 sync += cache_tags[r.store_index].eq(tagset)
896
897 sync += r.state.eq(State.WAIT_ACK)
898
    def icache_miss_wait_ack(self, m, r, replace_way, inval_in,
                             stbs_done, cache_valid_bits):
        """WAIT_ACK FSM state: stream the remaining wishbone reads for
        the cache line and consume the incoming acks.

        While stb is still asserted, each un-stalled cycle advances the
        request address to the next row; stb is dropped once the last
        row address has gone out.  Each ack marks the corresponding row
        valid; on the final ack the wishbone cycle is closed, the line
        is marked valid (unless an invalidate arrived meanwhile) and
        the FSM returns to IDLE.
        """
        comb = m.d.comb
        sync = m.d.sync

        wb_in = self.wb_in

        # Requests are all sent if stb is 0
        stbs_zero = Signal()
        comb += stbs_zero.eq(r.wb.stb == 0)
        comb += stbs_done.eq(stbs_zero)

        # If we are still sending requests, was one accepted?
        with m.If(~wb_in.stall & ~stbs_zero):
            # Was that the last word?  Then we are done sending:
            # clear stb and force stbs_done so an eventual last ack
            # can be handled on this same cycle.
            with m.If(is_last_row_addr(r.req_adr, r.end_row_ix)):
                sync += Display("IS_LAST_ROW_ADDR " \
                                "r.wb.addr:%x r.end_row_ix:%x " \
                                "r.wb.stb:%x stbs_zero:%x " \
                                "stbs_done:%x", r.wb.adr, \
                                r.end_row_ix, r.wb.stb, \
                                stbs_zero, stbs_done)
                sync += r.wb.stb.eq(0)
                comb += stbs_done.eq(1)

            # Calculate the next row address: increment only the
            # row-within-line portion of the request address (wraps
            # naturally within the line)
            rarange = Signal(LINE_OFF_BITS - ROW_OFF_BITS)
            comb += rarange.eq(
                r.req_adr[ROW_OFF_BITS:LINE_OFF_BITS] + 1
            )
            sync += r.req_adr[ROW_OFF_BITS:LINE_OFF_BITS].eq(
                rarange
            )
            sync += Display("RARANGE r.req_adr:%x rarange:%x "
                            "stbs_zero:%x stbs_done:%x",
                            r.req_adr, rarange, stbs_zero, stbs_done)

        # Incoming acks processing
        with m.If(wb_in.ack):
            sync += Display("WB_IN_ACK data:%x stbs_zero:%x "
                            "stbs_done:%x",
                            wb_in.dat, stbs_zero, stbs_done)

            # mark the just-received row of the line as valid
            sync += r.rows_valid[r.store_row % ROW_PER_LINE].eq(1)

            # Check for completion: all requests sent AND this ack is
            # for the last row of the line
            with m.If(stbs_done &
                      is_last_row(r.store_row, r.end_row_ix)):
                # Complete wishbone cycle
                sync += r.wb.cyc.eq(0)
                sync += r.req_adr.eq(0) # be nice, clear addr

                # Cache line is now valid -- set the victim way's
                # valid bit, unless the store was cancelled or an
                # invalidate arrived during the reload.
                # NOTE(review): same width question as in
                # icache_miss_clr_tag -- cv is a per-way vector but is
                # declared Signal(INDEX_BITS); confirm intent.
                cv = Signal(INDEX_BITS)
                comb += cv.eq(cache_valid_bits[r.store_index])
                comb += cv.bit_select(replace_way, 1).eq(
                    r.store_valid & ~inval_in
                )
                sync += cache_valid_bits[r.store_index].eq(cv)

                sync += r.state.eq(State.IDLE)

            # not completed, move on to next request in row
            with m.Else():
                # Increment store row counter
                sync += r.store_row.eq(next_row(r.store_row))
967
968
969 # Cache miss/reload synchronous machine
    def icache_miss(self, m, cache_valid_bits, r, req_is_miss,
                    req_index, req_laddr, req_tag, replace_way,
                    cache_tags, access_ok, real_addr):
        """Cache miss/reload synchronous state machine.

        Drives the wishbone master outputs, handles whole-cache
        invalidation requests, dispatches the IDLE / CLR_TAG /
        WAIT_ACK states to their helper methods, and tracks TLB-miss /
        protection-fault status in r.fetch_failed.
        """
        comb = m.d.comb
        sync = m.d.sync

        i_in, wb_in, m_in = self.i_in, self.wb_in, self.m_in
        stall_in, flush_in = self.stall_in, self.flush_in
        inval_in = self.inval_in

        # variable tagset : cache_tags_set_t;
        # variable stbs_done : boolean;

        tagset = Signal(TAG_RAM_WIDTH)
        stbs_done = Signal()

        # always read full rows: all byte-select lanes enabled;
        # wishbone address is the row address (byte address >> 3)
        comb += r.wb.sel.eq(-1)
        comb += r.wb.adr.eq(r.req_adr[3:])

        # Process cache invalidations: clear every line's valid bits
        # and cancel any in-flight store
        with m.If(inval_in):
            for i in range(NUM_LINES):
                sync += cache_valid_bits[i].eq(0)
            sync += r.store_valid.eq(0)

        # Main state machine
        with m.Switch(r.state):

            with m.Case(State.IDLE):
                self.icache_miss_idle(
                    m, r, req_is_miss, req_laddr,
                    req_index, req_tag, replace_way,
                    real_addr
                )

            # CLR_TAG and WAIT_ACK share a case deliberately: in the
            # CLR_TAG cycle the tag is written AND the WAIT_ACK logic
            # already runs, so the first wishbone handshake is not
            # delayed by an extra state-transition cycle.
            with m.Case(State.CLR_TAG, State.WAIT_ACK):
                with m.If(r.state == State.CLR_TAG):
                    self.icache_miss_clr_tag(
                        m, r, replace_way,
                        cache_valid_bits, req_index,
                        tagset, cache_tags
                    )

                self.icache_miss_wait_ack(
                    m, r, replace_way, inval_in,
                    stbs_done, cache_valid_bits
                )

        # TLB miss and protection fault processing: a flush or a TLB
        # load clears the failure; a request that fails the access
        # check (while not stalled) latches it
        with m.If(flush_in | m_in.tlbld):
            sync += r.fetch_failed.eq(0)
        with m.Elif(i_in.req & ~access_ok & ~stall_in):
            sync += r.fetch_failed.eq(1)
1023
1024 # icache_log: if LOG_LENGTH > 0 generate
    def icache_log(self, m, req_hit_way, ra_valid, access_ok,
                   req_is_miss, req_is_hit, lway, wstate, r):
        """Optional debug logger (microwatt "icache_log" generate).

        When LOG_LENGTH > 0, registers a 54-bit snapshot of the cache's
        externally-observable state each cycle and drives it out on
        log_out.

        NOTE(review): the `lway` and `wstate` parameters are shadowed
        by the local Signals created inside the loop, so the arguments
        are never used -- confirm whether they should be dropped from
        the signature (callers would need updating).
        """
        comb = m.d.comb
        sync = m.d.sync

        wb_in, i_out = self.wb_in, self.i_out
        log_out, stall_out = self.log_out, self.stall_out

        # -- Output data to logger
        # signal log_data : std_ulogic_vector(53 downto 0);
        # begin
        # data_log: process(clk)
        # variable lway: way_t;
        # variable wstate: std_ulogic;
        # Output data to logger
        for i in range(LOG_LENGTH):
            # Output data to logger
            log_data = Signal(54)
            lway = Signal(NUM_WAYS)
            wstate = Signal()

            # begin
            # if rising_edge(clk) then
            # lway := req_hit_way;
            # wstate := '0';
            # register the hit way; wstate flags "FSM busy"
            sync += lway.eq(req_hit_way)
            sync += wstate.eq(0)

            # if r.state /= IDLE then
            # wstate := '1';
            # end if;
            with m.If(r.state != State.IDLE):
                sync += wstate.eq(1)

            # log_data <= i_out.valid &
            # i_out.insn &
            # wishbone_in.ack &
            # r.wb.adr(5 downto 3) &
            # r.wb.stb & r.wb.cyc &
            # wishbone_in.stall &
            # stall_out &
            # r.fetch_failed &
            # r.hit_nia(5 downto 2) &
            # wstate &
            # std_ulogic_vector(to_unsigned(lway, 3)) &
            # req_is_hit & req_is_miss &
            # access_ok &
            # ra_valid;
            # pack the snapshot LSB-first (Cat order is reversed
            # relative to the VHDL '&' concatenation above)
            sync += log_data.eq(Cat(
                ra_valid, access_ok, req_is_miss, req_is_hit,
                lway, wstate, r.hit_nia[2:6],
                r.fetch_failed, stall_out, wb_in.stall, r.wb.cyc,
                r.wb.stb, r.wb.adr[3:6], wb_in.ack, i_out.insn,
                i_out.valid
            ))
            # end if;
            # end process;
            # log_out <= log_data;
            comb += log_out.eq(log_data)
        # end generate;
        # end;
1086
    def elaborate(self, platform):
        """Build the icache module: declare shared storage and request
        signals, then call the sub-elaboration methods (RAMs, PLRUs,
        TLB lookup/update, combinatorial hit path, hit latching, and
        the miss/reload state machine) that wire them together.
        """
        m = Module()
        comb = m.d.comb

        # Storage. Hopefully "cache_rows" is a BRAM, the rest is LUTs
        cache_tags = CacheTagArray()
        cache_valid_bits = CacheValidBitsArray()

        # signal itlb_valids : tlb_valids_t;
        # signal itlb_tags : tlb_tags_t;
        # signal itlb_ptes : tlb_ptes_t;
        # attribute ram_style of itlb_tags : signal is "distributed";
        # attribute ram_style of itlb_ptes : signal is "distributed";
        itlb_valid_bits = TLBValidBitsArray()
        itlb_tags = TLBTagArray()
        itlb_ptes = TLBPtesArray()
        # TODO to be passed to nmigen as ram attributes
        # attribute ram_style of itlb_tags : signal is "distributed";
        # attribute ram_style of itlb_ptes : signal is "distributed";

        # -- Privilege bit from PTE EAA field
        # signal eaa_priv : std_ulogic;
        # Privilege bit from PTE EAA field
        eaa_priv = Signal()

        # signal r : reg_internal_t;
        # all FSM / latched request state lives in this record
        r = RegInternal()

        # -- Async signals on incoming request
        # signal req_index : index_t;
        # signal req_row : row_t;
        # signal req_hit_way : way_t;
        # signal req_tag : cache_tag_t;
        # signal req_is_hit : std_ulogic;
        # signal req_is_miss : std_ulogic;
        # signal req_laddr : std_ulogic_vector(63 downto 0);
        # Async signal on incoming request
        # NOTE(review): these widths use the "count" constants
        # (NUM_LINES, BRAM_ROWS, NUM_WAYS) rather than their log2,
        # giving oversized-but-functional signals -- confirm whether
        # log2_int widths were intended.
        req_index = Signal(NUM_LINES)
        req_row = Signal(BRAM_ROWS)
        req_hit_way = Signal(NUM_WAYS)
        req_tag = Signal(TAG_BITS)
        req_is_hit = Signal()
        req_is_miss = Signal()
        req_laddr = Signal(64)

        # signal tlb_req_index : tlb_index_t;
        # signal real_addr : std_ulogic_vector(
        # REAL_ADDR_BITS - 1 downto 0
        # );
        # signal ra_valid : std_ulogic;
        # signal priv_fault : std_ulogic;
        # signal access_ok : std_ulogic;
        # signal use_previous : std_ulogic;
        tlb_req_index = Signal(TLB_SIZE)
        real_addr = Signal(REAL_ADDR_BITS)
        ra_valid = Signal()
        priv_fault = Signal()
        access_ok = Signal()
        use_previous = Signal()

        # signal cache_out : cache_ram_out_t;
        cache_out_row = Signal(ROW_SIZE_BITS)

        # signal plru_victim : plru_out_t;
        # signal replace_way : way_t;
        plru_victim = PLRUOut()
        replace_way = Signal(NUM_WAYS)

        # call sub-functions putting everything together, using shared
        # signals established above
        self.rams(m, r, cache_out_row, use_previous, replace_way, req_row)
        self.maybe_plrus(m, r, plru_victim)
        self.itlb_lookup(m, tlb_req_index, itlb_ptes, itlb_tags,
                         real_addr, itlb_valid_bits, ra_valid, eaa_priv,
                         priv_fault, access_ok)
        self.itlb_update(m, itlb_valid_bits, itlb_tags, itlb_ptes)
        self.icache_comb(m, use_previous, r, req_index, req_row, req_hit_way,
                         req_tag, real_addr, req_laddr, cache_valid_bits,
                         cache_tags, access_ok, req_is_hit, req_is_miss,
                         replace_way, plru_victim, cache_out_row)
        self.icache_hit(m, use_previous, r, req_is_hit, req_hit_way,
                        req_index, req_tag, real_addr)
        self.icache_miss(m, cache_valid_bits, r, req_is_miss, req_index,
                         req_laddr, req_tag, replace_way, cache_tags,
                         access_ok, real_addr)
        #self.icache_log(m, log_out, req_hit_way, ra_valid, access_ok,
        #                req_is_miss, req_is_hit, lway, wstate, r)

        return m
1177
1178
1179 # icache_tb.vhdl
1180 #
1181 # library ieee;
1182 # use ieee.std_logic_1164.all;
1183 #
1184 # library work;
1185 # use work.common.all;
1186 # use work.wishbone_types.all;
1187 #
1188 # entity icache_tb is
1189 # end icache_tb;
1190 #
1191 # architecture behave of icache_tb is
1192 # signal clk : std_ulogic;
1193 # signal rst : std_ulogic;
1194 #
1195 # signal i_out : Fetch1ToIcacheType;
1196 # signal i_in : IcacheToDecode1Type;
1197 #
1198 # signal m_out : MmuToIcacheType;
1199 #
1200 # signal wb_bram_in : wishbone_master_out;
1201 # signal wb_bram_out : wishbone_slave_out;
1202 #
1203 # constant clk_period : time := 10 ns;
1204 # begin
1205 # icache0: entity work.icache
1206 # generic map(
1207 # LINE_SIZE => 64,
1208 # NUM_LINES => 4
1209 # )
1210 # port map(
1211 # clk => clk,
1212 # rst => rst,
1213 # i_in => i_out,
1214 # i_out => i_in,
1215 # m_in => m_out,
1216 # stall_in => '0',
1217 # flush_in => '0',
1218 # inval_in => '0',
1219 # wishbone_out => wb_bram_in,
1220 # wishbone_in => wb_bram_out
1221 # );
1222 #
1223 # -- BRAM Memory slave
1224 # bram0: entity work.wishbone_bram_wrapper
1225 # generic map(
1226 # MEMORY_SIZE => 1024,
1227 # RAM_INIT_FILE => "icache_test.bin"
1228 # )
1229 # port map(
1230 # clk => clk,
1231 # rst => rst,
1232 # wishbone_in => wb_bram_in,
1233 # wishbone_out => wb_bram_out
1234 # );
1235 #
1236 # clk_process: process
1237 # begin
1238 # clk <= '0';
1239 # wait for clk_period/2;
1240 # clk <= '1';
1241 # wait for clk_period/2;
1242 # end process;
1243 #
1244 # rst_process: process
1245 # begin
1246 # rst <= '1';
1247 # wait for 2*clk_period;
1248 # rst <= '0';
1249 # wait;
1250 # end process;
1251 #
1252 # stim: process
1253 # begin
1254 # i_out.req <= '0';
1255 # i_out.nia <= (others => '0');
1256 # i_out.stop_mark <= '0';
1257 #
1258 # m_out.tlbld <= '0';
1259 # m_out.tlbie <= '0';
1260 # m_out.addr <= (others => '0');
1261 # m_out.pte <= (others => '0');
1262 #
1263 # wait until rising_edge(clk);
1264 # wait until rising_edge(clk);
1265 # wait until rising_edge(clk);
1266 # wait until rising_edge(clk);
1267 #
1268 # i_out.req <= '1';
1269 # i_out.nia <= x"0000000000000004";
1270 #
1271 # wait for 30*clk_period;
1272 # wait until rising_edge(clk);
1273 #
1274 # assert i_in.valid = '1' severity failure;
1275 # assert i_in.insn = x"00000001"
1276 # report "insn @" & to_hstring(i_out.nia) &
1277 # "=" & to_hstring(i_in.insn) &
1278 # " expected 00000001"
1279 # severity failure;
1280 #
1281 # i_out.req <= '0';
1282 #
1283 # wait until rising_edge(clk);
1284 #
1285 # -- hit
1286 # i_out.req <= '1';
1287 # i_out.nia <= x"0000000000000008";
1288 # wait until rising_edge(clk);
1289 # wait until rising_edge(clk);
1290 # assert i_in.valid = '1' severity failure;
1291 # assert i_in.insn = x"00000002"
1292 # report "insn @" & to_hstring(i_out.nia) &
1293 # "=" & to_hstring(i_in.insn) &
1294 # " expected 00000002"
1295 # severity failure;
1296 # wait until rising_edge(clk);
1297 #
1298 # -- another miss
1299 # i_out.req <= '1';
1300 # i_out.nia <= x"0000000000000040";
1301 #
1302 # wait for 30*clk_period;
1303 # wait until rising_edge(clk);
1304 #
1305 # assert i_in.valid = '1' severity failure;
1306 # assert i_in.insn = x"00000010"
1307 # report "insn @" & to_hstring(i_out.nia) &
1308 # "=" & to_hstring(i_in.insn) &
1309 # " expected 00000010"
1310 # severity failure;
1311 #
1312 # -- test something that aliases
1313 # i_out.req <= '1';
1314 # i_out.nia <= x"0000000000000100";
1315 # wait until rising_edge(clk);
1316 # wait until rising_edge(clk);
1317 # assert i_in.valid = '0' severity failure;
1318 # wait until rising_edge(clk);
1319 #
1320 # wait for 30*clk_period;
1321 # wait until rising_edge(clk);
1322 #
1323 # assert i_in.valid = '1' severity failure;
1324 # assert i_in.insn = x"00000040"
1325 # report "insn @" & to_hstring(i_out.nia) &
1326 # "=" & to_hstring(i_in.insn) &
1327 # " expected 00000040"
1328 # severity failure;
1329 #
1330 # i_out.req <= '0';
1331 #
1332 # std.env.finish;
1333 # end process;
1334 # end;
def icache_sim(dut):
    """Simulation stimulus for the ICache.

    Exercises: a cold miss (line reload), a hit on the same line, a
    second miss on the next line, and an aliasing address that must
    first miss before being served, asserting the returned instruction
    words against the test-memory pattern (word i contains value i/4).

    Fix: the aliasing check used `assert ~valid`, which is bitwise NOT
    on an int (~0 == -1, ~1 == -2 -- both truthy) and therefore could
    never fail; it is now `assert not valid`.
    """
    i_out = dut.i_in    # fetch request side (Fetch1 -> icache)
    i_in = dut.i_out    # result side (icache -> Decode1)
    m_out = dut.m_in    # MMU -> icache (idle throughout this test)

    # drive all inputs to a known idle state
    yield i_in.valid.eq(0)
    yield i_out.priv_mode.eq(1)
    yield i_out.req.eq(0)
    yield i_out.nia.eq(0)
    yield i_out.stop_mark.eq(0)
    yield m_out.tlbld.eq(0)
    yield m_out.tlbie.eq(0)
    yield m_out.addr.eq(0)
    yield m_out.pte.eq(0)
    yield
    yield
    yield
    yield

    # cold miss: request word at 0x4 and wait out the line reload
    yield i_out.req.eq(1)
    yield i_out.nia.eq(Const(0x0000000000000004, 64))
    for i in range(30):
        yield
    yield
    valid = yield i_in.valid
    nia = yield i_out.nia
    insn = yield i_in.insn
    print(f"valid? {valid}")
    assert valid
    assert insn == 0x00000001, \
        "insn @%x=%x expected 00000001" % (nia, insn)
    yield i_out.req.eq(0)
    yield

    # hit: same line, should be valid after two cycles
    yield
    yield
    yield i_out.req.eq(1)
    yield i_out.nia.eq(Const(0x0000000000000008, 64))
    yield
    yield
    valid = yield i_in.valid
    nia = yield i_in.nia
    insn = yield i_in.insn
    assert valid
    assert insn == 0x00000002, \
        "insn @%x=%x expected 00000002" % (nia, insn)
    yield

    # another miss: next cache line
    yield i_out.req.eq(1)
    yield i_out.nia.eq(Const(0x0000000000000040, 64))
    for i in range(30):
        yield
    yield
    valid = yield i_in.valid
    nia = yield i_out.nia
    insn = yield i_in.insn
    assert valid
    assert insn == 0x00000010, \
        "insn @%x=%x expected 00000010" % (nia, insn)

    # test something that aliases: must miss first...
    yield i_out.req.eq(1)
    yield i_out.nia.eq(Const(0x0000000000000100, 64))
    yield
    yield
    valid = yield i_in.valid
    assert not valid        # was `assert ~valid`: always-true bug
    for i in range(30):
        yield
    yield
    # ...then be served after the reload completes
    valid = yield i_in.valid
    nia = yield i_out.nia   # re-read nia so the message isn't stale
    insn = yield i_in.insn
    assert valid
    assert insn == 0x00000040, \
        "insn @%x=%x expected 00000040" % (nia, insn)
    yield i_out.req.eq(0)
1413
1414
1415
def test_icache(mem):
    """Instantiate an ICache wired to a wishbone SRAM model initialised
    with *mem*, and run the icache_sim stimulus under the nmigen
    simulator, dumping a VCD trace to test_icache.vcd.
    """
    dut = ICache()

    init_mem = Memory(width=64, depth=512, init=mem)
    sram = SRAM(memory=init_mem, granularity=8)

    m = Module()
    m.submodules.icache = dut
    m.submodules.sram = sram

    # wire the icache's wishbone master to the SRAM slave, and the
    # slave's responses back to the icache
    m.d.comb += [
        sram.bus.cyc.eq(dut.wb_out.cyc),
        sram.bus.stb.eq(dut.wb_out.stb),
        sram.bus.we.eq(dut.wb_out.we),
        sram.bus.sel.eq(dut.wb_out.sel),
        sram.bus.adr.eq(dut.wb_out.adr),
        sram.bus.dat_w.eq(dut.wb_out.dat),
        dut.wb_in.ack.eq(sram.bus.ack),
        dut.wb_in.dat.eq(sram.bus.dat_r),
    ]

    # nmigen Simulation
    sim = Simulator(m)
    sim.add_clock(1e-6)
    sim.add_sync_process(wrap(icache_sim(dut)))
    with sim.write_vcd('test_icache.vcd'):
        sim.run()
1444
if __name__ == '__main__':
    # emit standalone RTLIL of the icache for inspection/synthesis
    dut = ICache()
    vl = rtlil.convert(dut, ports=[])
    with open("test_icache.il", "w") as f:
        f.write(vl)

    # test memory pattern: each 64-bit dword packs two consecutive
    # 32-bit words -- (2*i) in the low half, (2*i+1) in the high half
    mem = [(i * 2) | ((i * 2 + 1) << 32) for i in range(512)]

    test_icache(mem)
1456