1 """ICache
2
3 based on Anton Blanchard microwatt icache.vhdl
4
5 Set associative icache
6
7 TODO (in no specific order):
8 * Add debug interface to inspect cache content
9 * Add snoop/invalidate path
10 * Add multi-hit error detection
11 * Pipelined bus interface (wb or axi)
12 * Maybe add parity? There's a few bits free in each BRAM row on Xilinx
13 * Add optimization: service hits on partially loaded lines
14 * Add optimization: (maybe) interrupt reload on fluch/redirect
15 * Check if playing with the geometry of the cache tags allow for more
16 efficient use of distributed RAM and less logic/muxes. Currently we
17 write TAG_BITS width which may not match full ram blocks and might
18 cause muxes to be inferred for "partial writes".
19 * Check if making the read size of PLRU a ROM helps utilization
20
21 """
from enum import Enum, unique
from nmigen import (Module, Signal, Elaboratable, Cat, Array, Const,
                    ResetSignal)
from nmigen.cli import main
from nmigen.cli import rtlil
from nmutil.iocontrol import RecordObject
from nmutil.byterev import byte_reverse
from nmutil.mask import Mask
from nmigen.utils import log2_int
from nmutil.util import Display

from soc.experiment.mem_types import (Fetch1ToICacheType,
                                      ICacheToDecode1Type,
                                      MMUToICacheType)

from soc.experiment.wb_types import (WB_ADDR_BITS, WB_DATA_BITS,
                                     WB_SEL_BITS, WBAddrType, WBDataType,
                                     WBSelType, WBMasterOut, WBSlaveOut,
                                     WBMasterOutVector, WBSlaveOutVector,
                                     WBIOMasterOut, WBIOSlaveOut)

# cache RAM and PLRU helpers, needed by rams() and maybe_plrus() below
# (assumption: same modules the sibling dcache code uses)
from soc.experiment.cache_ram import CacheRam
from soc.experiment.plru import PLRU


SIM = 0
LINE_SIZE = 64
# BRAM organisation: We never access more than wishbone_data_bits
# at a time so to save resources we make the array only that wide,
# and use consecutive indices to make a cache "line"
#
# ROW_SIZE is the width in bytes of the BRAM (based on WB, so 64-bits)
ROW_SIZE = WB_DATA_BITS // 8
# Number of lines in a set
NUM_LINES = 32
# Number of ways
NUM_WAYS = 4
# L1 ITLB number of entries (direct mapped)
TLB_SIZE = 64
# L1 ITLB log_2(page_size)
TLB_LG_PGSZ = 12
# Number of real address bits that we store
REAL_ADDR_BITS = 56
# Non-zero to enable log data collection
LOG_LENGTH = 0

ROW_SIZE_BITS = ROW_SIZE * 8
# ROW_PER_LINE is the number of row
# (wishbone) transactions in a line
ROW_PER_LINE = LINE_SIZE // ROW_SIZE
# BRAM_ROWS is the number of rows in
# BRAM needed to represent the full icache
BRAM_ROWS = NUM_LINES * ROW_PER_LINE
# INSN_PER_ROW is the number of 32bit
# instructions per BRAM row
INSN_PER_ROW = ROW_SIZE_BITS // 32

# Bit fields counts in the address
#
# INSN_BITS is the number of bits to
# select an instruction in a row
INSN_BITS = log2_int(INSN_PER_ROW)
# ROW_BITS is the number of bits to
# select a row
ROW_BITS = log2_int(BRAM_ROWS)
# ROW_LINE_BITS is the number of bits to
# select a row within a line
ROW_LINE_BITS = log2_int(ROW_PER_LINE)
# LINE_OFF_BITS is the number of bits for
# the offset in a cache line
LINE_OFF_BITS = log2_int(LINE_SIZE)
# ROW_OFF_BITS is the number of bits for
# the offset in a row
ROW_OFF_BITS = log2_int(ROW_SIZE)
# INDEX_BITS is the number of bits to
# select a cache line
INDEX_BITS = log2_int(NUM_LINES)
# SET_SIZE_BITS is the log base 2 of
# the set size
SET_SIZE_BITS = LINE_OFF_BITS + INDEX_BITS
# TAG_BITS is the number of bits of
# the tag part of the address
TAG_BITS = REAL_ADDR_BITS - SET_SIZE_BITS
# WAY_BITS is the number of bits to
# select a way
WAY_BITS = log2_int(NUM_WAYS)
TAG_RAM_WIDTH = TAG_BITS * NUM_WAYS

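# With the defaults above (and WB_DATA_BITS = 64, per the "based on
# WB, so 64-bits" comment) the derived geometry works out to:
# ROW_SIZE=8, ROW_SIZE_BITS=64, ROW_PER_LINE=8, BRAM_ROWS=256,
# INSN_PER_ROW=2, INSN_BITS=1, ROW_BITS=8, ROW_LINE_BITS=3,
# LINE_OFF_BITS=6, ROW_OFF_BITS=3, INDEX_BITS=5, SET_SIZE_BITS=11,
# TAG_BITS=45, WAY_BITS=2, TAG_RAM_WIDTH=180.
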
# -- L1 ITLB.
# constant TLB_BITS : natural := log2(TLB_SIZE);
# constant TLB_EA_TAG_BITS : natural := 64 - (TLB_LG_PGSZ + TLB_BITS);
# constant TLB_PTE_BITS : natural := 64;
TLB_BITS = log2_int(TLB_SIZE)
TLB_EA_TAG_BITS = 64 - (TLB_LG_PGSZ + TLB_BITS)
TLB_PTE_BITS = 64

# architecture rtl of icache is
#constant ROW_SIZE_BITS : natural := ROW_SIZE*8;
#-- ROW_PER_LINE is the number of row (wishbone
#-- transactions) in a line
#constant ROW_PER_LINE : natural := LINE_SIZE / ROW_SIZE;
#-- BRAM_ROWS is the number of rows in BRAM
#-- needed to represent the full
#-- icache
#constant BRAM_ROWS : natural := NUM_LINES * ROW_PER_LINE;
#-- INSN_PER_ROW is the number of 32bit instructions per BRAM row
#constant INSN_PER_ROW : natural := ROW_SIZE_BITS / 32;
#-- Bit fields counts in the address
#
#-- INSN_BITS is the number of bits to select
#-- an instruction in a row
#constant INSN_BITS : natural := log2(INSN_PER_ROW);
#-- ROW_BITS is the number of bits to select a row
#constant ROW_BITS : natural := log2(BRAM_ROWS);
#-- ROW_LINEBITS is the number of bits to
#-- select a row within a line
#constant ROW_LINEBITS : natural := log2(ROW_PER_LINE);
#-- LINE_OFF_BITS is the number of bits for the offset
#-- in a cache line
#constant LINE_OFF_BITS : natural := log2(LINE_SIZE);
#-- ROW_OFF_BITS is the number of bits for the offset in a row
#constant ROW_OFF_BITS : natural := log2(ROW_SIZE);
#-- INDEX_BITS is the number of bits to select a cache line
#constant INDEX_BITS : natural := log2(NUM_LINES);
#-- SET_SIZE_BITS is the log base 2 of the set size
#constant SET_SIZE_BITS : natural := LINE_OFF_BITS + INDEX_BITS;
#-- TAG_BITS is the number of bits of the tag part of the address
#constant TAG_BITS : natural := REAL_ADDR_BITS - SET_SIZE_BITS;
#-- WAY_BITS is the number of bits to select a way
#constant WAY_BITS : natural := log2(NUM_WAYS);

# Example of layout for 32 lines of 64 bytes:
#
# .. tag |index| line |
# .. | row | |
# .. | | | |00| zero (2)
# .. | | |-| | INSN_BITS (1)
# .. | |---| | ROW_LINEBITS (3)
# .. | |--- - --| LINE_OFF_BITS (6)
# .. | |- --| ROW_OFF_BITS (3)
# .. |----- ---| | ROW_BITS (8)
# .. |-----| | INDEX_BITS (5)
# .. --------| | TAG_BITS (53)

#subtype row_t is integer range 0 to BRAM_ROWS-1;
#subtype index_t is integer range 0 to NUM_LINES-1;
#subtype way_t is integer range 0 to NUM_WAYS-1;
#subtype row_in_line_t is unsigned(ROW_LINEBITS-1 downto 0);
#
#-- The cache data BRAM organized as described above for each way
#subtype cache_row_t is std_ulogic_vector(ROW_SIZE_BITS-1 downto 0);
#
#-- The cache tags LUTRAM has a row per set. Vivado is a pain and will
#-- not handle a clean (commented) definition of the cache tags as a 3d
#-- memory. For now, work around it by putting all the tags
#subtype cache_tag_t is std_logic_vector(TAG_BITS-1 downto 0);
# type cache_tags_set_t is array(way_t) of cache_tag_t;
# type cache_tags_array_t is array(index_t) of cache_tags_set_t;
#constant TAG_RAM_WIDTH : natural := TAG_BITS * NUM_WAYS;
#subtype cache_tags_set_t is std_logic_vector(TAG_RAM_WIDTH-1 downto 0);
#type cache_tags_array_t is array(index_t) of cache_tags_set_t;
def CacheTagArray():
    return Array(Signal(TAG_RAM_WIDTH) for x in range(NUM_LINES))

#-- The cache valid bits
#subtype cache_way_valids_t is std_ulogic_vector(NUM_WAYS-1 downto 0);
#type cache_valids_t is array(index_t) of cache_way_valids_t;
#type row_per_line_valid_t is array(0 to ROW_PER_LINE - 1) of std_ulogic;
def CacheValidBitsArray():
    # per the VHDL: one NUM_WAYS-wide valid bitmask per line (index_t)
    return Array(Signal(NUM_WAYS) for x in range(NUM_LINES))

def RowPerLineValidArray():
    return Array(Signal() for x in range(ROW_PER_LINE))


# TODO to be passed to nmigen as ram attributes
# attribute ram_style : string;
# attribute ram_style of cache_tags : signal is "distributed";


#subtype tlb_index_t is integer range 0 to TLB_SIZE - 1;
#type tlb_valids_t is array(tlb_index_t) of std_ulogic;
#subtype tlb_tag_t is std_ulogic_vector(TLB_EA_TAG_BITS - 1 downto 0);
#type tlb_tags_t is array(tlb_index_t) of tlb_tag_t;
#subtype tlb_pte_t is std_ulogic_vector(TLB_PTE_BITS - 1 downto 0);
#type tlb_ptes_t is array(tlb_index_t) of tlb_pte_t;
def TLBValidBitsArray():
    return Array(Signal() for x in range(TLB_SIZE))

def TLBTagArray():
    return Array(Signal(TLB_EA_TAG_BITS) for x in range(TLB_SIZE))

def TLBPTEArray():
    return Array(Signal(TLB_PTE_BITS) for x in range(TLB_SIZE))


#-- Cache RAM interface
#type cache_ram_out_t is array(way_t) of cache_row_t;
# Cache RAM interface
def CacheRamOut():
    return Array(Signal(ROW_SIZE_BITS) for x in range(NUM_WAYS))

#-- PLRU output interface
#type plru_out_t is array(index_t) of
# std_ulogic_vector(WAY_BITS-1 downto 0);
# PLRU output interface
def PLRUOut():
    return Array(Signal(WAY_BITS) for x in range(NUM_LINES))

# begin
#
# assert LINE_SIZE mod ROW_SIZE = 0;
# assert ispow2(LINE_SIZE) report "LINE_SIZE not power of 2"
# severity FAILURE;
# assert ispow2(NUM_LINES) report "NUM_LINES not power of 2"
# severity FAILURE;
# assert ispow2(ROW_PER_LINE) report "ROW_PER_LINE not power of 2"
# severity FAILURE;
# assert ispow2(INSN_PER_ROW) report "INSN_PER_ROW not power of 2"
# severity FAILURE;
# assert (ROW_BITS = INDEX_BITS + ROW_LINEBITS)
# report "geometry bits don't add up" severity FAILURE;
# assert (LINE_OFF_BITS = ROW_OFF_BITS + ROW_LINEBITS)
# report "geometry bits don't add up" severity FAILURE;
# assert (REAL_ADDR_BITS = TAG_BITS + INDEX_BITS + LINE_OFF_BITS)
# report "geometry bits don't add up" severity FAILURE;
# assert (REAL_ADDR_BITS = TAG_BITS + ROW_BITS + ROW_OFF_BITS)
# report "geometry bits don't add up" severity FAILURE;
#
# sim_debug: if SIM generate
# debug: process
# begin
# report "ROW_SIZE = " & natural'image(ROW_SIZE);
# report "ROW_PER_LINE = " & natural'image(ROW_PER_LINE);
# report "BRAM_ROWS = " & natural'image(BRAM_ROWS);
# report "INSN_PER_ROW = " & natural'image(INSN_PER_ROW);
# report "INSN_BITS = " & natural'image(INSN_BITS);
# report "ROW_BITS = " & natural'image(ROW_BITS);
# report "ROW_LINEBITS = " & natural'image(ROW_LINEBITS);
# report "LINE_OFF_BITS = " & natural'image(LINE_OFF_BITS);
# report "ROW_OFF_BITS = " & natural'image(ROW_OFF_BITS);
# report "INDEX_BITS = " & natural'image(INDEX_BITS);
# report "TAG_BITS = " & natural'image(TAG_BITS);
# report "WAY_BITS = " & natural'image(WAY_BITS);
# wait;
# end process;
# end generate;
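
# The VHDL geometry assertions above translate directly to Python and
# can run at import time; a minimal sketch (ispow2 is a small local
# helper, not part of the original file):
def ispow2(n):
    return n != 0 and (n & (n - 1)) == 0

assert LINE_SIZE % ROW_SIZE == 0, "LINE_SIZE not multiple of ROW_SIZE"
assert ispow2(LINE_SIZE), "LINE_SIZE not power of 2"
assert ispow2(NUM_LINES), "NUM_LINES not power of 2"
assert ispow2(ROW_PER_LINE), "ROW_PER_LINE not power of 2"
assert ispow2(INSN_PER_ROW), "INSN_PER_ROW not power of 2"
assert ROW_BITS == INDEX_BITS + ROW_LINE_BITS, \
    "geometry bits don't add up"
assert LINE_OFF_BITS == ROW_OFF_BITS + ROW_LINE_BITS, \
    "geometry bits don't add up"
assert REAL_ADDR_BITS == TAG_BITS + INDEX_BITS + LINE_OFF_BITS, \
    "geometry bits don't add up"
assert REAL_ADDR_BITS == TAG_BITS + ROW_BITS + ROW_OFF_BITS, \
    "geometry bits don't add up"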

# Cache reload state machine
@unique
class State(Enum):
    IDLE = 0
    CLR_TAG = 1
    WAIT_ACK = 2

# type reg_internal_t is record
# -- Cache hit state (Latches for 1 cycle BRAM access)
# hit_way : way_t;
# hit_nia : std_ulogic_vector(63 downto 0);
# hit_smark : std_ulogic;
# hit_valid : std_ulogic;
#
# -- Cache miss state (reload state machine)
# state : state_t;
# wb : wishbone_master_out;
# store_way : way_t;
# store_index : index_t;
# store_row : row_t;
# store_tag : cache_tag_t;
# store_valid : std_ulogic;
# end_row_ix : row_in_line_t;
# rows_valid : row_per_line_valid_t;
#
# -- TLB miss state
# fetch_failed : std_ulogic;
# end record;
class RegInternal(RecordObject):
    def __init__(self):
        super().__init__()
        # Cache hit state (Latches for 1 cycle BRAM access)
        # way_t/index_t/row_t are indices, hence log2-sized signals
        self.hit_way = Signal(WAY_BITS)
        self.hit_nia = Signal(64)
        self.hit_smark = Signal()
        self.hit_valid = Signal()

        # Cache miss state (reload state machine)
        self.state = Signal(State)
        self.wb = WBMasterOut()
        self.store_way = Signal(WAY_BITS)
        self.store_index = Signal(INDEX_BITS)
        self.store_row = Signal(ROW_BITS)
        self.store_tag = Signal(TAG_BITS)
        self.store_valid = Signal()
        self.end_row_ix = Signal(ROW_LINE_BITS)
        self.rows_valid = RowPerLineValidArray()

        # TLB miss state
        self.fetch_failed = Signal()

# -- 64 bit direct mapped icache. All instructions are 4B aligned.
#
# entity icache is
# generic (
# SIM : boolean := false;
# -- Line size in bytes
# LINE_SIZE : positive := 64;
# -- BRAM organisation: We never access more
# -- than wishbone_data_bits
# -- at a time so to save resources we make the
# -- array only that wide,
# -- and use consecutive indices for to make a cache "line"
# --
# -- ROW_SIZE is the width in bytes of the BRAM (based on WB,
# -- so 64-bits)
# ROW_SIZE : positive := wishbone_data_bits / 8;
# -- Number of lines in a set
# NUM_LINES : positive := 32;
# -- Number of ways
# NUM_WAYS : positive := 4;
# -- L1 ITLB number of entries (direct mapped)
# TLB_SIZE : positive := 64;
# -- L1 ITLB log_2(page_size)
# TLB_LG_PGSZ : positive := 12;
# -- Number of real address bits that we store
# REAL_ADDR_BITS : positive := 56;
# -- Non-zero to enable log data collection
# LOG_LENGTH : natural := 0
# );
# port (
# clk : in std_ulogic;
# rst : in std_ulogic;
#
# i_in : in Fetch1ToIcacheType;
# i_out : out IcacheToDecode1Type;
#
# m_in : in MmuToIcacheType;
#
# stall_in : in std_ulogic;
# stall_out : out std_ulogic;
# flush_in : in std_ulogic;
# inval_in : in std_ulogic;
#
# wishbone_out : out wishbone_master_out;
# wishbone_in : in wishbone_slave_out;
#
# log_out : out std_ulogic_vector(53 downto 0)
# );
# end entity icache;
# 64 bit direct mapped icache. All instructions are 4B aligned.
class ICache(Elaboratable):
    """64 bit direct mapped icache. All instructions are 4B aligned."""
    def __init__(self):
        self.i_in = Fetch1ToICacheType()
        self.i_out = ICacheToDecode1Type()

        self.m_in = MMUToICacheType()

        self.stall_in = Signal()
        self.stall_out = Signal()
        self.flush_in = Signal()
        self.inval_in = Signal()

        self.wb_out = WBMasterOut()
        self.wb_in = WBSlaveOut()

        self.log_out = Signal(54)

    # -- Return the cache line index (tag index) for an address
    # function get_index(addr: std_ulogic_vector(63 downto 0))
    # return index_t is
    # begin
    # return to_integer(unsigned(
    # addr(SET_SIZE_BITS - 1 downto LINE_OFF_BITS)
    # ));
    # end;
    # Return the cache line index (tag index) for an address
    def get_index(addr):
        return addr[LINE_OFF_BITS:SET_SIZE_BITS]

    # -- Return the cache row index (data memory) for an address
    # function get_row(addr: std_ulogic_vector(63 downto 0))
    # return row_t is
    # begin
    # return to_integer(unsigned(
    # addr(SET_SIZE_BITS - 1 downto ROW_OFF_BITS)
    # ));
    # end;
    # Return the cache row index (data memory) for an address
    def get_row(addr):
        return addr[ROW_OFF_BITS:SET_SIZE_BITS]

    # -- Return the index of a row within a line
    # function get_row_of_line(row: row_t) return row_in_line_t is
    # variable row_v : unsigned(ROW_BITS-1 downto 0);
    # begin
    # row_v := to_unsigned(row, ROW_BITS);
    # return row_v(ROW_LINEBITS-1 downto 0);
    # end;
    # Return the index of a row within a line
    def get_row_of_line(row):
        return row[:ROW_LINE_BITS]

    # -- Returns whether this is the last row of a line
    # function is_last_row_addr(addr: wishbone_addr_type;
    # last: row_in_line_t
    # )
    # return boolean is
    # begin
    # return unsigned(
    # addr(LINE_OFF_BITS-1 downto ROW_OFF_BITS)
    # ) = last;
    # end;
    # Returns whether this is the last row of a line
    def is_last_row_addr(addr, last):
        return addr[ROW_OFF_BITS:LINE_OFF_BITS] == last

    # -- Returns whether this is the last row of a line
    # function is_last_row(row: row_t;
    # last: row_in_line_t) return boolean is
    # begin
    # return get_row_of_line(row) = last;
    # end;
    # Returns whether this is the last row of a line
    def is_last_row(row, last):
        return get_row_of_line(row) == last

    # -- Return the address of the next row in the current cache line
    # function next_row_addr(addr: wishbone_addr_type)
    # return std_ulogic_vector is
    # variable row_idx : std_ulogic_vector(ROW_LINEBITS-1 downto 0);
    # variable result : wishbone_addr_type;
    # begin
    # -- Is there no simpler way in VHDL to generate that 3 bits adder ?
    # row_idx := addr(LINE_OFF_BITS-1 downto ROW_OFF_BITS);
    # row_idx := std_ulogic_vector(unsigned(row_idx) + 1);
    # result := addr;
    # result(LINE_OFF_BITS-1 downto ROW_OFF_BITS) := row_idx;
    # return result;
    # end;
    # Return the address of the next row in the current cache line
    def next_row_addr(addr):
        # a sketch of the VHDL above: the "double assignment" is just
        # an increment of the row-within-line field of the address,
        # with the rest passed through unchanged.  Slicing the sum
        # keeps the adder ROW_LINE_BITS wide and drops the carry,
        # exactly like the VHDL variable assignments.
        row_idx = (addr[ROW_OFF_BITS:LINE_OFF_BITS] + 1)[:ROW_LINE_BITS]
        return Cat(addr[:ROW_OFF_BITS], row_idx, addr[LINE_OFF_BITS:])

    # -- Return the next row in the current cache line. We use a dedicated
    # -- function in order to limit the size of the generated adder to be
    # -- only the bits within a cache line (3 bits with default settings)
    # function next_row(row: row_t) return row_t is
    # variable row_v : std_ulogic_vector(ROW_BITS-1 downto 0);
    # variable row_idx : std_ulogic_vector(ROW_LINEBITS-1 downto 0);
    # variable result : std_ulogic_vector(ROW_BITS-1 downto 0);
    # begin
    # row_v := std_ulogic_vector(to_unsigned(row, ROW_BITS));
    # row_idx := row_v(ROW_LINEBITS-1 downto 0);
    # row_v(ROW_LINEBITS-1 downto 0) :=
    # std_ulogic_vector(unsigned(row_idx) + 1);
    # return to_integer(unsigned(row_v));
    # end;
    # Return the next row in the current cache line. We use a dedicated
    # function in order to limit the size of the generated adder to be
    # only the bits within a cache line (3 bits with default settings)
    def next_row(row):
        # same increment-in-place trick as next_row_addr, applied to a
        # row number instead of a byte address
        row_idx = (row[:ROW_LINE_BITS] + 1)[:ROW_LINE_BITS]
        return Cat(row_idx, row[ROW_LINE_BITS:])

    # -- Read the instruction word for the given address in the
    # -- current cache row
    # function read_insn_word(addr: std_ulogic_vector(63 downto 0);
    # data: cache_row_t) return std_ulogic_vector is
    # variable word: integer range 0 to INSN_PER_ROW-1;
    # begin
    # word := to_integer(unsigned(addr(INSN_BITS+2-1 downto 2)));
    # return data(31+word*32 downto word*32);
    # end;
    # Read the instruction word for the given address
    # in the current cache row
    def read_insn_word(addr, data):
        word = addr[2:INSN_BITS+2]
        # dynamic selection of 32-bit word number "word" from the row
        return data.word_select(word, 32)

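    # Worked example with the defaults above: ROW_SIZE_BITS=64 gives
    # INSN_PER_ROW=2 and INSN_BITS=1, so only addr bit 2 is inspected
    # and word_select(word, 32) picks data[0:32] or data[32:64], i.e.
    # the first or second instruction of the BRAM row.
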
    # -- Get the tag value from the address
    # function get_tag(
    # addr: std_ulogic_vector(REAL_ADDR_BITS - 1 downto 0)
    # )
    # return cache_tag_t is
    # begin
    # return addr(REAL_ADDR_BITS - 1 downto SET_SIZE_BITS);
    # end;
    # Get the tag value from the address
    def get_tag(addr):
        return addr[SET_SIZE_BITS:REAL_ADDR_BITS]

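    # Worked example with the defaults above (LINE_OFF_BITS=6,
    # SET_SIZE_BITS=11, REAL_ADDR_BITS=56): addr[0:6] is the byte
    # offset within the line, get_index() returns addr[6:11] (the
    # 5-bit line index) and get_tag() returns addr[11:56] (the
    # remaining 45 tag bits).
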
    # -- Read a tag from a tag memory row
    # function read_tag(way: way_t; tagset: cache_tags_set_t)
    # return cache_tag_t is
    # begin
    # return tagset((way+1) * TAG_BITS - 1 downto way * TAG_BITS);
    # end;
    # Read a tag from a tag memory row
    def read_tag(way, tagset):
        return tagset[way * TAG_BITS:(way + 1) * TAG_BITS]

    # -- Write a tag to tag memory row
    # procedure write_tag(way: in way_t;
    # tagset: inout cache_tags_set_t; tag: cache_tag_t) is
    # begin
    # tagset((way+1) * TAG_BITS - 1 downto way * TAG_BITS) := tag;
    # end;
    # Write a tag to tag memory row
    def write_tag(way, tagset, tag):
        # nmigen signals are assigned with .eq(), not Python slice
        # assignment: return the statement for the caller to add to
        # a domain
        return tagset[way * TAG_BITS:(way + 1) * TAG_BITS].eq(tag)

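    # Usage sketch: write_tag() returns an Assign statement, so a
    # caller adds it to a domain, e.g.
    #     comb += write_tag(way, tagset, tag)
    # which updates only bits [way*TAG_BITS : (way+1)*TAG_BITS] of the
    # tagset row, leaving the other ways' tags untouched.
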
    # -- Simple hash for direct-mapped TLB index
    # function hash_ea(addr: std_ulogic_vector(63 downto 0))
    # return tlb_index_t is
    # variable hash : std_ulogic_vector(TLB_BITS - 1 downto 0);
    # begin
    # hash := addr(TLB_LG_PGSZ + TLB_BITS - 1 downto TLB_LG_PGSZ)
    # xor addr(
    # TLB_LG_PGSZ + 2 * TLB_BITS - 1 downto
    # TLB_LG_PGSZ + TLB_BITS
    # )
    # xor addr(
    # TLB_LG_PGSZ + 3 * TLB_BITS - 1 downto
    # TLB_LG_PGSZ + 2 * TLB_BITS
    # );
    # return to_integer(unsigned(hash));
    # end;
    # Simple hash for direct-mapped TLB index
    def hash_ea(addr):
        hsh = addr[TLB_LG_PGSZ:TLB_LG_PGSZ + TLB_BITS] ^ addr[
              TLB_LG_PGSZ + TLB_BITS:TLB_LG_PGSZ + 2 * TLB_BITS
              ] ^ addr[
              TLB_LG_PGSZ + 2 * TLB_BITS:TLB_LG_PGSZ + 3 * TLB_BITS
              ]
        return hsh

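    # Worked example with the defaults (TLB_LG_PGSZ=12, TLB_BITS=6):
    # the hash XOR-folds effective-address bits [12:18], [18:24] and
    # [24:30] into a 6-bit TLB index, so e.g. ea=0x3000 (page 3, all
    # upper address bits zero) maps to TLB entry 3.
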
    # -- Generate a cache RAM for each way
    # rams: for i in 0 to NUM_WAYS-1 generate
    # signal do_read : std_ulogic;
    # signal do_write : std_ulogic;
    # signal rd_addr : std_ulogic_vector(ROW_BITS-1 downto 0);
    # signal wr_addr : std_ulogic_vector(ROW_BITS-1 downto 0);
    # signal dout : cache_row_t;
    # signal wr_sel : std_ulogic_vector(ROW_SIZE-1 downto 0);
    # begin
    # way: entity work.cache_ram
    # generic map (
    # ROW_BITS => ROW_BITS,
    # WIDTH => ROW_SIZE_BITS
    # )
    # port map (
    # clk => clk,
    # rd_en => do_read,
    # rd_addr => rd_addr,
    # rd_data => dout,
    # wr_sel => wr_sel,
    # wr_addr => wr_addr,
    # wr_data => wishbone_in.dat
    # );
    # process(all)
    # begin
    # do_read <= not (stall_in or use_previous);
    # do_write <= '0';
    # if wishbone_in.ack = '1' and replace_way = i then
    # do_write <= '1';
    # end if;
    # cache_out(i) <= dout;
    # rd_addr <=
    # std_ulogic_vector(to_unsigned(req_row, ROW_BITS));
    # wr_addr <=
    # std_ulogic_vector(to_unsigned(r.store_row, ROW_BITS));
    # for i in 0 to ROW_SIZE-1 loop
    # wr_sel(i) <= do_write;
    # end loop;
    # end process;
    # end generate;
    def rams(self, m):
        comb = m.d.comb

        do_read = Signal()
        do_write = Signal()
        rd_addr = Signal(ROW_BITS)
        wr_addr = Signal(ROW_BITS)
        _d_out = Signal(ROW_SIZE_BITS)
        wr_sel = Signal(ROW_SIZE)

        for i in range(NUM_WAYS):
            way = CacheRam(ROW_BITS, ROW_SIZE_BITS)
            m.submodules += way
            comb += way.rd_en.eq(do_read)
            comb += way.rd_addr.eq(rd_addr)
            # rd_data is an output of the RAM: copy it out of the
            # way, not into it
            comb += _d_out.eq(way.rd_data)
            comb += way.wr_sel.eq(wr_sel)
            comb += way.wr_addr.eq(wr_addr)
            comb += way.wr_data.eq(wb_in.dat)

            comb += do_read.eq(~(stall_in | use_previous))
            comb += do_write.eq(0)

            with m.If(wb_in.ack & (replace_way == i)):
                comb += do_write.eq(1)

            comb += cache_out[i].eq(_d_out)
            comb += rd_addr.eq(req_row)
            comb += wr_addr.eq(r.store_row)
            for j in range(ROW_SIZE):
                comb += wr_sel[j].eq(do_write)

    # -- Generate PLRUs
    # maybe_plrus: if NUM_WAYS > 1 generate
    # begin
    # plrus: for i in 0 to NUM_LINES-1 generate
    # -- PLRU interface
    # signal plru_acc : std_ulogic_vector(WAY_BITS-1 downto 0);
    # signal plru_acc_en : std_ulogic;
    # signal plru_out : std_ulogic_vector(WAY_BITS-1 downto 0);
    #
    # begin
    # plru : entity work.plru
    # generic map (
    # BITS => WAY_BITS
    # )
    # port map (
    # clk => clk,
    # rst => rst,
    # acc => plru_acc,
    # acc_en => plru_acc_en,
    # lru => plru_out
    # );
    #
    # process(all)
    # begin
    # -- PLRU interface
    # if get_index(r.hit_nia) = i then
    # plru_acc_en <= r.hit_valid;
    # else
    # plru_acc_en <= '0';
    # end if;
    # plru_acc <=
    # std_ulogic_vector(to_unsigned(r.hit_way, WAY_BITS));
    # plru_victim(i) <= plru_out;
    # end process;
    # end generate;
    # end generate;
    def maybe_plrus(self, m):
        comb = m.d.comb

        # a VHDL generate: resolved at elaboration time, hence a
        # plain Python "if", not m.If
        if NUM_WAYS > 1:
            for i in range(NUM_LINES):
                plru_acc = Signal(WAY_BITS)
                plru_acc_en = Signal()
                plru_out = Signal(WAY_BITS)
                plru = PLRU(WAY_BITS)
                m.submodules += plru
                comb += plru.acc.eq(plru_acc)
                comb += plru.acc_en.eq(plru_acc_en)
                # lru is an output of the PLRU: copy it out
                comb += plru_out.eq(plru.lru)

                # PLRU interface
                with m.If(get_index(r.hit_nia) == i):
                    comb += plru_acc_en.eq(r.hit_valid)

                with m.Else():
                    comb += plru_acc_en.eq(0)

                comb += plru_acc.eq(r.hit_way)
                comb += plru_victim[i].eq(plru_out)

    # -- TLB hit detection and real address generation
    # itlb_lookup : process(all)
    # variable pte : tlb_pte_t;
    # variable ttag : tlb_tag_t;
    # begin
    # tlb_req_index <= hash_ea(i_in.nia);
    # pte := itlb_ptes(tlb_req_index);
    # ttag := itlb_tags(tlb_req_index);
    # if i_in.virt_mode = '1' then
    # real_addr <= pte(REAL_ADDR_BITS - 1 downto TLB_LG_PGSZ) &
    # i_in.nia(TLB_LG_PGSZ - 1 downto 0);
    # if ttag = i_in.nia(63 downto TLB_LG_PGSZ + TLB_BITS) then
    # ra_valid <= itlb_valids(tlb_req_index);
    # else
    # ra_valid <= '0';
    # end if;
    # eaa_priv <= pte(3);
    # else
    # real_addr <= i_in.nia(REAL_ADDR_BITS - 1 downto 0);
    # ra_valid <= '1';
    # eaa_priv <= '1';
    # end if;
    #
    # -- no IAMR, so no KUEP support for now
    # priv_fault <= eaa_priv and not i_in.priv_mode;
    # access_ok <= ra_valid and not priv_fault;
    # end process;
    # TLB hit detection and real address generation
    def itlb_lookup(self, m):
        comb = m.d.comb

        comb += tlb_req_index.eq(hash_ea(i_in.nia))
        comb += pte.eq(itlb_ptes[tlb_req_index])
        comb += ttag.eq(itlb_tags[tlb_req_index])

        with m.If(i_in.virt_mode):
            comb += real_addr.eq(Cat(
                     i_in.nia[:TLB_LG_PGSZ],
                     pte[TLB_LG_PGSZ:REAL_ADDR_BITS]
                    ))

            with m.If(ttag == i_in.nia[TLB_LG_PGSZ + TLB_BITS:64]):
                comb += ra_valid.eq(itlb_valid_bits[tlb_req_index])

            with m.Else():
                comb += ra_valid.eq(0)

            # the VHDL also takes the privilege bit from the PTE
            # EAA field here
            comb += eaa_priv.eq(pte[3])

        with m.Else():
            comb += real_addr.eq(i_in.nia[:REAL_ADDR_BITS])
            comb += ra_valid.eq(1)
            comb += eaa_priv.eq(1)

        # No IAMR, so no KUEP support for now
        comb += priv_fault.eq(eaa_priv & ~i_in.priv_mode)
        comb += access_ok.eq(ra_valid & ~priv_fault)

    # -- iTLB update
    # itlb_update: process(clk)
    # variable wr_index : tlb_index_t;
    # begin
    # if rising_edge(clk) then
    # wr_index := hash_ea(m_in.addr);
    # if rst = '1' or
    # (m_in.tlbie = '1' and m_in.doall = '1') then
    # -- clear all valid bits
    # for i in tlb_index_t loop
    # itlb_valids(i) <= '0';
    # end loop;
    # elsif m_in.tlbie = '1' then
    # -- clear entry regardless of hit or miss
    # itlb_valids(wr_index) <= '0';
    # elsif m_in.tlbld = '1' then
    # itlb_tags(wr_index) <=
    # m_in.addr(63 downto TLB_LG_PGSZ + TLB_BITS);
    # itlb_ptes(wr_index) <= m_in.pte;
    # itlb_valids(wr_index) <= '1';
    # end if;
    # end if;
    # end process;
    # iTLB update
    def itlb_update(self, m):
        comb = m.d.comb
        sync = m.d.sync

        wr_index = Signal(TLB_BITS)
        # a VHDL process "variable": combinational, not registered
        comb += wr_index.eq(hash_ea(m_in.addr))

        with m.If(ResetSignal() | (m_in.tlbie & m_in.doall)):
            # Clear all valid bits
            for i in range(TLB_SIZE):
                sync += itlb_valid_bits[i].eq(0)

        with m.Elif(m_in.tlbie):
            # Clear entry regardless of hit or miss
            sync += itlb_valid_bits[wr_index].eq(0)

        with m.Elif(m_in.tlbld):
            sync += itlb_tags[wr_index].eq(
                m_in.addr[TLB_LG_PGSZ + TLB_BITS:64]
            )
            sync += itlb_ptes[wr_index].eq(m_in.pte)
            sync += itlb_valid_bits[wr_index].eq(1)

    # -- Cache hit detection, output to fetch2 and other misc logic
    # icache_comb : process(all)
    # Cache hit detection, output to fetch2 and other misc logic
    def icache_comb(self, m):
        # variable is_hit : std_ulogic;
        # variable hit_way : way_t;
        comb = m.d.comb

        is_hit = Signal()
        hit_way = Signal(WAY_BITS)
        # begin
        # -- i_in.sequential means that i_in.nia this cycle
        # -- is 4 more than last cycle. If we read more
        # -- than 32 bits at a time, had a cache hit last
        # -- cycle, and we don't want the first 32-bit chunk
        # -- then we can keep the data we read last cycle
        # -- and just use that.
        # if unsigned(i_in.nia(INSN_BITS+2-1 downto 2)) /= 0 then
        # use_previous <= i_in.sequential and r.hit_valid;
        # else
        # use_previous <= '0';
        # end if;
        # i_in.sequential means that i_in.nia this cycle is 4 more than
        # last cycle. If we read more than 32 bits at a time, had a
        # cache hit last cycle, and we don't want the first 32-bit chunk
        # then we can keep the data we read last cycle and just use that.
        with m.If(i_in.nia[2:INSN_BITS+2] != 0):
            comb += use_previous.eq(i_in.sequential & r.hit_valid)

        with m.Else():
            comb += use_previous.eq(0)

        # -- Extract line, row and tag from request
        # req_index <= get_index(i_in.nia);
        # req_row <= get_row(i_in.nia);
        # req_tag <= get_tag(real_addr);
        # Extract line, row and tag from request
        comb += req_index.eq(get_index(i_in.nia))
        comb += req_row.eq(get_row(i_in.nia))
        comb += req_tag.eq(get_tag(real_addr))

        # -- Calculate address of beginning of cache row, will be
        # -- used for cache miss processing if needed
        # req_laddr <=
        # (63 downto REAL_ADDR_BITS => '0') &
        # real_addr(REAL_ADDR_BITS - 1 downto ROW_OFF_BITS) &
        # (ROW_OFF_BITS-1 downto 0 => '0');
        # Calculate address of beginning of cache row, will be
        # used for cache miss processing if needed
        comb += req_laddr.eq(Cat(
                 Const(0b0, ROW_OFF_BITS),
                 real_addr[ROW_OFF_BITS:REAL_ADDR_BITS],
                 Const(0, 64 - REAL_ADDR_BITS)  # zero-pad to 64 bits
                ))

        # -- Test if pending request is a hit on any way
        # hit_way := 0;
        # is_hit := '0';
        # for i in way_t loop
        # if i_in.req = '1' and
        # (cache_valids(req_index)(i) = '1' or
        # (r.state = WAIT_ACK and
        # req_index = r.store_index and
        # i = r.store_way and
        # r.rows_valid(req_row mod ROW_PER_LINE) = '1')) then
        # if read_tag(i, cache_tags(req_index)) = req_tag then
        # hit_way := i;
        # is_hit := '1';
        # end if;
        # end if;
        # end loop;
        # Test if pending request is a hit on any way
        for i in range(NUM_WAYS):
            with m.If(i_in.req &
                      (cache_valid_bits[req_index][i] |
                       ((r.state == State.WAIT_ACK)
                        & (req_index == r.store_index)
                        & (i == r.store_way)
                        & r.rows_valid[req_row % ROW_PER_LINE]))):
                with m.If(read_tag(i, cache_tags[req_index]) == req_tag):
                    comb += hit_way.eq(i)
                    comb += is_hit.eq(1)

        # -- Generate the "hit" and "miss" signals
        # -- for the synchronous blocks
        # if i_in.req = '1' and access_ok = '1' and flush_in = '0'
        # and rst = '0' then
        # req_is_hit <= is_hit;
        # req_is_miss <= not is_hit;
        # else
        # req_is_hit <= '0';
        # req_is_miss <= '0';
        # end if;
        # req_hit_way <= hit_way;
        # Generate the "hit" and "miss" signals
        # for the synchronous blocks
        with m.If(i_in.req & access_ok & ~flush_in):
            comb += req_is_hit.eq(is_hit)
            comb += req_is_miss.eq(~is_hit)

        with m.Else():
            comb += req_is_hit.eq(0)
            comb += req_is_miss.eq(0)

        comb += req_hit_way.eq(hit_way)

        # -- The way to replace on a miss
        # if r.state = CLR_TAG then
        # replace_way <=
        # to_integer(unsigned(plru_victim(r.store_index)));
        # else
        # replace_way <= r.store_way;
        # end if;
        # The way to replace on a miss
        with m.If(r.state == State.CLR_TAG):
            comb += replace_way.eq(plru_victim[r.store_index])

        with m.Else():
            comb += replace_way.eq(r.store_way)

        # -- Output instruction from current cache row
        # --
        # -- Note: This is a mild violation of our design principle of
        # -- having pipeline stages output from a clean latch. In this
        # -- case we output the result of a mux. The alternative would
        # -- be output an entire row which I prefer not to do just yet
        # -- as it would force fetch2 to know about some of the cache
        # -- geometry information.
        # i_out.insn <= read_insn_word(r.hit_nia, cache_out(r.hit_way));
        # i_out.valid <= r.hit_valid;
        # i_out.nia <= r.hit_nia;
        # i_out.stop_mark <= r.hit_smark;
        # i_out.fetch_failed <= r.fetch_failed;
        # Output instruction from current cache row
        #
        # Note: This is a mild violation of our design principle of
        # having pipeline stages output from a clean latch. In this
        # case we output the result of a mux. The alternative would
        # be output an entire row which I prefer not to do just yet
        # as it would force fetch2 to know about some of the cache
        # geometry information.
        comb += i_out.insn.eq(
            read_insn_word(r.hit_nia, cache_out[r.hit_way])
        )
        comb += i_out.valid.eq(r.hit_valid)
        comb += i_out.nia.eq(r.hit_nia)
        comb += i_out.stop_mark.eq(r.hit_smark)
        comb += i_out.fetch_failed.eq(r.fetch_failed)

        # -- Stall fetch1 if we have a miss on cache or TLB
        # -- or a protection fault
        # stall_out <= not (is_hit and access_ok);
        # Stall fetch1 if we have a miss on cache or TLB
        # or a protection fault
        comb += stall_out.eq(~(is_hit & access_ok))

        # -- Wishbone requests output (from the cache miss reload machine)
        # wishbone_out <= r.wb;
        # Wishbone requests output (from the cache miss reload machine)
        comb += wb_out.eq(r.wb)
        # end process;

    # -- Cache hit synchronous machine
    # icache_hit : process(clk)
    # Cache hit synchronous machine
    def icache_hit(self, m):
        sync = m.d.sync
        # begin
        # if rising_edge(clk) then
        # -- keep outputs to fetch2 unchanged on a stall
        # -- except that flush or reset sets valid to 0
        # -- If use_previous, keep the same data as last
        # -- cycle and use the second half
        # if stall_in = '1' or use_previous = '1' then
        # if rst = '1' or flush_in = '1' then
        # r.hit_valid <= '0';
        # end if;
        # keep outputs to fetch2 unchanged on a stall
        # except that flush or reset sets valid to 0
        # If use_previous, keep the same data as last
        # cycle and use the second half
        with m.If(stall_in | use_previous):
            with m.If(ResetSignal() | flush_in):
                sync += r.hit_valid.eq(0)
        # else
        # -- On a hit, latch the request for the next cycle,
        # -- when the BRAM data will be available on the
        # -- cache_out output of the corresponding way
        # r.hit_valid <= req_is_hit;
        # if req_is_hit = '1' then
        # r.hit_way <= req_hit_way;
        with m.Else():
            # On a hit, latch the request for the next cycle,
            # when the BRAM data will be available on the
            # cache_out output of the corresponding way
            sync += r.hit_valid.eq(req_is_hit)

            with m.If(req_is_hit):
                sync += r.hit_way.eq(req_hit_way)

                # report "cache hit nia:" & to_hstring(i_in.nia) &
                # " IR:" & std_ulogic'image(i_in.virt_mode) &
                # " SM:" & std_ulogic'image(i_in.stop_mark) &
                # " idx:" & integer'image(req_index) &
                # " tag:" & to_hstring(req_tag) &
                # " way:" & integer'image(req_hit_way) &
                # " RA:" & to_hstring(real_addr);
                # Display fires at simulation time; a plain print()
                # would fire once at elaboration instead
                sync += Display(
                    "cache hit nia:%x IR:%x SM:%x idx:%x tag:%x "
                    "way:%x RA:%x",
                    i_in.nia, i_in.virt_mode, i_in.stop_mark,
                    req_index, req_tag, req_hit_way, real_addr)
        # end if;
        # end if;
        # if stall_in = '0' then
        # -- Send stop marks and NIA down regardless of validity
        # r.hit_smark <= i_in.stop_mark;
        # r.hit_nia <= i_in.nia;
        # end if;
        with m.If(~stall_in):
            # Send stop marks and NIA down regardless of validity
            sync += r.hit_smark.eq(i_in.stop_mark)
            sync += r.hit_nia.eq(i_in.nia)
        # end if;
        # end process;

    # -- Cache miss/reload synchronous machine
    # icache_miss : process(clk)
    # Cache miss/reload synchronous machine
    def icache_miss(self, m):
        comb = m.d.comb
        sync = m.d.sync

        # variable tagset : cache_tags_set_t;
        # variable stbs_done : boolean;

        tagset = Signal(TAG_RAM_WIDTH)
        stbs_done = Signal()

        # begin
        # if rising_edge(clk) then
        # -- On reset, clear all valid bits to force misses
        # if rst = '1' then
        # On reset, clear all valid bits to force misses
        with m.If(ResetSignal()):
            # for i in index_t loop
            # cache_valids(i) <= (others => '0');
            # end loop;
            for i in range(NUM_LINES):
                sync += cache_valid_bits[i].eq(0)

            # r.state <= IDLE;
            # r.wb.cyc <= '0';
            # r.wb.stb <= '0';
            sync += r.state.eq(State.IDLE)
            sync += r.wb.cyc.eq(0)
            sync += r.wb.stb.eq(0)

            # -- We only ever do reads on wishbone
            # r.wb.dat <= (others => '0');
            # r.wb.sel <= "11111111";
            # r.wb.we <= '0';
            # We only ever do reads on wishbone
            sync += r.wb.dat.eq(0)
            sync += r.wb.sel.eq(Const(0b11111111, 8))
            sync += r.wb.we.eq(0)

            # -- Not useful normally but helps avoiding
            # -- tons of sim warnings
            # r.wb.adr <= (others => '0');
            # Not useful normally but helps avoiding tons of sim warnings
            sync += r.wb.adr.eq(0)

        # else
        with m.Else():
            # -- Process cache invalidations
            # if inval_in = '1' then
            # for i in index_t loop
            # cache_valids(i) <= (others => '0');
            # end loop;
            # r.store_valid <= '0';
            # end if;
            # Process cache invalidations
            with m.If(inval_in):
                for i in range(NUM_LINES):
                    sync += cache_valid_bits[i].eq(0)

                sync += r.store_valid.eq(0)

            # -- Main state machine
            # case r.state is
            # Main state machine
            with m.Switch(r.state):

                # when IDLE =>
                with m.Case(State.IDLE):
                    # -- Reset per-row valid flags,
                    # -- only used in WAIT_ACK
                    # for i in 0 to ROW_PER_LINE - 1 loop
                    # r.rows_valid(i) <= '0';
                    # end loop;
                    # Reset per-row valid flags,
                    # only used in WAIT_ACK
                    for i in range(ROW_PER_LINE):
                        sync += r.rows_valid[i].eq(0)

                    # -- We need to read a cache line
                    # if req_is_miss = '1' then
                    # report "cache miss nia:" & to_hstring(i_in.nia) &
                    # " IR:" & std_ulogic'image(i_in.virt_mode) &
                    # " SM:" & std_ulogic'image(i_in.stop_mark) &
                    # " idx:" & integer'image(req_index) &
                    # " way:" & integer'image(replace_way) &
                    # " tag:" & to_hstring(req_tag) &
                    # " RA:" & to_hstring(real_addr);
                    # We need to read a cache line
                    with m.If(req_is_miss):
                        sync += Display(
                            "cache miss nia:%x IR:%x SM:%x idx:%x "
                            "way:%x tag:%x RA:%x",
                            i_in.nia, i_in.virt_mode, i_in.stop_mark,
                            req_index, replace_way, req_tag, real_addr)

                        # -- Keep track of our index and way for
                        # -- subsequent stores
                        # r.store_index <= req_index;
                        # r.store_row <= get_row(req_laddr);
                        # r.store_tag <= req_tag;
                        # r.store_valid <= '1';
                        # r.end_row_ix <=
                        # get_row_of_line(get_row(req_laddr)) - 1;
                        # Keep track of our index and way
                        # for subsequent stores
                        sync += r.store_index.eq(req_index)
                        sync += r.store_row.eq(get_row(req_laddr))
                        sync += r.store_tag.eq(req_tag)
                        sync += r.store_valid.eq(1)
                        sync += r.end_row_ix.eq(
                            get_row_of_line(
                                get_row(req_laddr)
                            ) - 1
                        )

                        # -- Prep for first wishbone read. We calculate the
                        # -- address of the start of the cache line and
                        # -- start the WB cycle.
                        # r.wb.adr <= req_laddr(r.wb.adr'left downto 0);
                        # r.wb.cyc <= '1';
                        # r.wb.stb <= '1';
                        # Prep for first wishbone read. We calculate the
                        # address of the start of the cache line and
                        # start the WB cycle.
                        sync += r.wb.adr.eq(
                            req_laddr[:r.wb.adr.width]
                        )
                        sync += r.wb.cyc.eq(1)
                        sync += r.wb.stb.eq(1)

                        # -- Track that we had one request sent
                        # r.state <= CLR_TAG;
                        # Track that we had one request sent
                        sync += r.state.eq(State.CLR_TAG)
                    # end if;

                # when CLR_TAG | WAIT_ACK =>
                with m.Case(State.CLR_TAG, State.WAIT_ACK):
                    # if r.state = CLR_TAG then
                    with m.If(r.state == State.CLR_TAG):
                        # -- Get victim way from plru
                        # r.store_way <= replace_way;
                        # Get victim way from plru
                        sync += r.store_way.eq(replace_way)
                        #
                        # -- Force misses on that way while
                        # -- reloading that line
                        # cache_valids(req_index)(replace_way) <= '0';
                        # Force misses on that way while
                        # reloading that line
                        # (bit_select, as replace_way is a Signal)
                        sync += cache_valid_bits[req_index].bit_select(
                            replace_way, 1
                        ).eq(0)

                        # -- Store new tag in selected way
                        # for i in 0 to NUM_WAYS-1 loop
                        # if i = replace_way then
                        # tagset := cache_tags(r.store_index);
                        # write_tag(i, tagset, r.store_tag);
                        # cache_tags(r.store_index) <= tagset;
                        # end if;
                        # end loop;
                        for i in range(NUM_WAYS):
                            with m.If(i == replace_way):
                                # tagset is a VHDL "variable":
                                # read-modify-written combinationally
                                comb += tagset.eq(
                                    cache_tags[r.store_index]
                                )
                                comb += write_tag(
                                    i, tagset, r.store_tag
                                )
                                sync += cache_tags[r.store_index].eq(
                                    tagset
                                )

                        # r.state <= WAIT_ACK;
                        sync += r.state.eq(State.WAIT_ACK)
                    # end if;

                    # -- Requests are all sent if stb is 0
                    # stbs_done := r.wb.stb = '0';
                    # Requests are all sent if stb is 0
                    comb += stbs_done.eq(r.wb.stb == 0)

                    # -- If we are still sending requests,
                    # -- was one accepted ?
                    # if wishbone_in.stall = '0' and not stbs_done then
                    # If we are still sending requests,
                    # was one accepted?
                    with m.If(~wb_in.stall & ~stbs_done):
                        # -- That was the last word ? We are done sending.
                        # -- Clear stb and set stbs_done so we can handle
                        # -- an eventual last ack on the same cycle.
                        # if is_last_row_addr(r.wb.adr, r.end_row_ix) then
                        # r.wb.stb <= '0';
                        # stbs_done := true;
                        # end if;
                        # That was the last word? We are done sending.
                        # Clear stb and set stbs_done so we can handle
                        # an eventual last ack on the same cycle.
                        with m.If(is_last_row_addr(
                                  r.wb.adr, r.end_row_ix)):
                            sync += r.wb.stb.eq(0)
                            comb += stbs_done.eq(1)

                        # -- Calculate the next row address
                        # r.wb.adr <= next_row_addr(r.wb.adr);
                        # Calculate the next row address
                        sync += r.wb.adr.eq(next_row_addr(r.wb.adr))
                    # end if;

                    # -- Incoming acks processing
                    # if wishbone_in.ack = '1' then
                    # Incoming acks processing
                    with m.If(wb_in.ack):
                        # r.rows_valid(r.store_row mod ROW_PER_LINE)
                        # <= '1';
                        sync += r.rows_valid[
                            r.store_row % ROW_PER_LINE
                        ].eq(1)

                        # -- Check for completion
                        # if stbs_done and
                        # is_last_row(r.store_row, r.end_row_ix) then
                        # Check for completion
                        with m.If(stbs_done & is_last_row(
                                  r.store_row, r.end_row_ix)):
                            # -- Complete wishbone cycle
                            # r.wb.cyc <= '0';
                            # Complete wishbone cycle
                            sync += r.wb.cyc.eq(0)

                            # -- Cache line is now valid
                            # cache_valids(r.store_index)(replace_way) <=
                            # r.store_valid and not inval_in;
                            # Cache line is now valid
                            sync += cache_valid_bits[
                                r.store_index
                            ].bit_select(replace_way, 1).eq(
                                r.store_valid & ~inval_in
                            )

                            # -- We are done
                            # r.state <= IDLE;
                            # We are done
                            sync += r.state.eq(State.IDLE)
                        # end if;

                        # -- Increment store row counter
                        # r.store_row <= next_row(r.store_row);
                        # Increment store row counter
                        sync += r.store_row.eq(next_row(r.store_row))
                    # end if;
            # end case;
        # end if;
        #
        # -- TLB miss and protection fault processing
        # if rst = '1' or flush_in = '1' or m_in.tlbld = '1' then
        # r.fetch_failed <= '0';
        # elsif i_in.req = '1' and access_ok = '0' and
        # stall_in = '0' then
        # r.fetch_failed <= '1';
        # end if;
        # TLB miss and protection fault processing
        with m.If(ResetSignal() | flush_in | m_in.tlbld):
            sync += r.fetch_failed.eq(0)

        with m.Elif(i_in.req & ~access_ok & ~stall_in):
            sync += r.fetch_failed.eq(1)
        # end if;
        # end process;

    # icache_log: if LOG_LENGTH > 0 generate
    def icache_log(self, m, log_out):
        comb = m.d.comb
        sync = m.d.sync

        # -- Output data to logger
        # signal log_data : std_ulogic_vector(53 downto 0);
        # begin
        # data_log: process(clk)
        # variable lway: way_t;
        # variable wstate: std_ulogic;
        # Output data to logger
        if LOG_LENGTH > 0:
            # a conditional VHDL generate: elaborate the logging
            # logic only when LOG_LENGTH is non-zero
            log_data = Signal(54)
            lway = Signal(3)     # 3 bits in the 54-bit log layout
            wstate = Signal()

            # begin
            # if rising_edge(clk) then
            # lway := req_hit_way;
            # wstate := '0';
            comb += lway.eq(req_hit_way)
            comb += wstate.eq(0)

            # if r.state /= IDLE then
            # wstate := '1';
            # end if;
            with m.If(r.state != State.IDLE):
                comb += wstate.eq(1)

            # log_data <= i_out.valid &
            # i_out.insn &
            # wishbone_in.ack &
            # r.wb.adr(5 downto 3) &
            # r.wb.stb & r.wb.cyc &
            # wishbone_in.stall &
            # stall_out &
            # r.fetch_failed &
            # r.hit_nia(5 downto 2) &
            # wstate &
            # std_ulogic_vector(to_unsigned(lway, 3)) &
            # req_is_hit & req_is_miss &
            # access_ok &
            # ra_valid;
            sync += log_data.eq(Cat(
                     ra_valid, access_ok, req_is_miss, req_is_hit,
                     lway, wstate, r.hit_nia[2:6],
                     r.fetch_failed, stall_out, wb_in.stall, r.wb.cyc,
                     r.wb.stb, r.wb.adr[3:6], wb_in.ack, i_out.insn,
                     i_out.valid
                    ))
            # end if;
            # end process;
            # log_out <= log_data;
            comb += log_out.eq(log_data)
    # end generate;
    # end;

    def elaborate(self, platform):
        m = Module()

        comb = m.d.comb
        sync = m.d.sync

        # -- Storage. Hopefully "cache_rows" is a BRAM, the rest is LUTs
        # signal cache_tags : cache_tags_array_t;
        # signal cache_valids : cache_valids_t;
        # Storage. Hopefully "cache_rows" is a BRAM, the rest is LUTs
        cache_tags = CacheTagArray()
        cache_valid_bits = CacheValidBitsArray()

        # signal itlb_valids : tlb_valids_t;
        # signal itlb_tags : tlb_tags_t;
        # signal itlb_ptes : tlb_ptes_t;
        # attribute ram_style of itlb_tags : signal is "distributed";
        # attribute ram_style of itlb_ptes : signal is "distributed";
        itlb_valid_bits = TLBValidBitsArray()
        itlb_tags = TLBTagArray()
        itlb_ptes = TLBPTEArray()
        # TODO to be passed to nmigen as ram attributes
        # attribute ram_style of itlb_tags : signal is "distributed";
        # attribute ram_style of itlb_ptes : signal is "distributed";

        # -- Privilege bit from PTE EAA field
        # signal eaa_priv : std_ulogic;
        # Privilege bit from PTE EAA field
        eaa_priv = Signal()

        # signal r : reg_internal_t;
        r = RegInternal()

        # -- Async signals on incoming request
        # signal req_index : index_t;
        # signal req_row : row_t;
        # signal req_hit_way : way_t;
        # signal req_tag : cache_tag_t;
        # signal req_is_hit : std_ulogic;
        # signal req_is_miss : std_ulogic;
        # signal req_laddr : std_ulogic_vector(63 downto 0);
        # Async signals on incoming request
        req_index = Signal(INDEX_BITS)
        req_row = Signal(ROW_BITS)
        req_hit_way = Signal(WAY_BITS)
        req_tag = Signal(TAG_BITS)
        req_is_hit = Signal()
        req_is_miss = Signal()
        req_laddr = Signal(64)

        # signal tlb_req_index : tlb_index_t;
        # signal real_addr : std_ulogic_vector(
        # REAL_ADDR_BITS - 1 downto 0
        # );
        # signal ra_valid : std_ulogic;
        # signal priv_fault : std_ulogic;
        # signal access_ok : std_ulogic;
        # signal use_previous : std_ulogic;
        tlb_req_index = Signal(TLB_BITS)
        real_addr = Signal(REAL_ADDR_BITS)
        ra_valid = Signal()
        priv_fault = Signal()
        access_ok = Signal()
        use_previous = Signal()

        # signal cache_out : cache_ram_out_t;
        cache_out = CacheRamOut()

        # signal plru_victim : plru_out_t;
        # signal replace_way : way_t;
        plru_victim = PLRUOut()
        replace_way = Signal(WAY_BITS)

        # TODO: the helper processes above (rams, maybe_plrus,
        # itlb_lookup, itlb_update, icache_comb, icache_hit,
        # icache_miss, icache_log) still need to be invoked here
        # and handed these signals

        return m


# icache_tb.vhdl
#
# library ieee;
# use ieee.std_logic_1164.all;
#
# library work;
# use work.common.all;
# use work.wishbone_types.all;
#
# entity icache_tb is
# end icache_tb;
#
# architecture behave of icache_tb is
# signal clk : std_ulogic;
# signal rst : std_ulogic;
#
# signal i_out : Fetch1ToIcacheType;
# signal i_in : IcacheToDecode1Type;
#
# signal m_out : MmuToIcacheType;
#
# signal wb_bram_in : wishbone_master_out;
# signal wb_bram_out : wishbone_slave_out;
#
# constant clk_period : time := 10 ns;
# begin
# icache0: entity work.icache
# generic map(
# LINE_SIZE => 64,
# NUM_LINES => 4
# )
# port map(
# clk => clk,
# rst => rst,
# i_in => i_out,
# i_out => i_in,
# m_in => m_out,
# stall_in => '0',
# flush_in => '0',
# inval_in => '0',
# wishbone_out => wb_bram_in,
# wishbone_in => wb_bram_out
# );
#
# -- BRAM Memory slave
# bram0: entity work.wishbone_bram_wrapper
# generic map(
# MEMORY_SIZE => 1024,
# RAM_INIT_FILE => "icache_test.bin"
# )
# port map(
# clk => clk,
# rst => rst,
# wishbone_in => wb_bram_in,
# wishbone_out => wb_bram_out
# );
#
# clk_process: process
# begin
# clk <= '0';
# wait for clk_period/2;
# clk <= '1';
# wait for clk_period/2;
# end process;
#
# rst_process: process
# begin
# rst <= '1';
# wait for 2*clk_period;
# rst <= '0';
# wait;
# end process;
#
# stim: process
# begin
# i_out.req <= '0';
# i_out.nia <= (others => '0');
# i_out.stop_mark <= '0';
#
# m_out.tlbld <= '0';
# m_out.tlbie <= '0';
# m_out.addr <= (others => '0');
# m_out.pte <= (others => '0');
#
# wait until rising_edge(clk);
# wait until rising_edge(clk);
# wait until rising_edge(clk);
# wait until rising_edge(clk);
#
# i_out.req <= '1';
# i_out.nia <= x"0000000000000004";
#
# wait for 30*clk_period;
# wait until rising_edge(clk);
#
# assert i_in.valid = '1' severity failure;
# assert i_in.insn = x"00000001"
# report "insn @" & to_hstring(i_out.nia) &
# "=" & to_hstring(i_in.insn) &
# " expected 00000001"
# severity failure;
#
# i_out.req <= '0';
#
# wait until rising_edge(clk);
#
# -- hit
# i_out.req <= '1';
# i_out.nia <= x"0000000000000008";
# wait until rising_edge(clk);
# wait until rising_edge(clk);
# assert i_in.valid = '1' severity failure;
# assert i_in.insn = x"00000002"
# report "insn @" & to_hstring(i_out.nia) &
# "=" & to_hstring(i_in.insn) &
# " expected 00000002"
# severity failure;
# wait until rising_edge(clk);
#
# -- another miss
# i_out.req <= '1';
# i_out.nia <= x"0000000000000040";
#
# wait for 30*clk_period;
# wait until rising_edge(clk);
#
# assert i_in.valid = '1' severity failure;
# assert i_in.insn = x"00000010"
# report "insn @" & to_hstring(i_out.nia) &
# "=" & to_hstring(i_in.insn) &
# " expected 00000010"
# severity failure;
#
# -- test something that aliases
# i_out.req <= '1';
# i_out.nia <= x"0000000000000100";
# wait until rising_edge(clk);
# wait until rising_edge(clk);
# assert i_in.valid = '0' severity failure;
# wait until rising_edge(clk);
#
# wait for 30*clk_period;
# wait until rising_edge(clk);
#
# assert i_in.valid = '1' severity failure;
# assert i_in.insn = x"00000040"
# report "insn @" & to_hstring(i_out.nia) &
# "=" & to_hstring(i_in.insn) &
# " expected 00000040"
# severity failure;
#
# i_out.req <= '0';
#
# std.env.finish;
# end process;
# end;
def icache_sim(dut):
    # the testbench's i_out drives the cache's i_in, and its i_in
    # observes the cache's i_out (same naming flip as the VHDL tb)
    i_out = dut.i_in
    i_in = dut.i_out
    m_out = dut.m_in

    yield i_out.req.eq(0)
    yield i_out.nia.eq(0)
    yield i_out.stop_mark.eq(0)
    yield m_out.tlbld.eq(0)
    yield m_out.tlbie.eq(0)
    yield m_out.addr.eq(0)
    yield m_out.pte.eq(0)
    yield
    yield
    yield
    yield
    yield i_out.req.eq(1)
    yield i_out.nia.eq(Const(0x0000000000000004, 64))
    for i in range(30):
        yield
    yield
    valid = yield i_in.valid
    insn = yield i_in.insn
    nia = yield i_out.nia
    assert valid
    assert insn == 0x00000001, \
        "insn @%x=%x expected 00000001" % (nia, insn)
    yield i_out.req.eq(0)
    yield

    # hit
    yield i_out.req.eq(1)
    yield i_out.nia.eq(Const(0x0000000000000008, 64))
    yield
    yield
    valid = yield i_in.valid
    insn = yield i_in.insn
    nia = yield i_out.nia
    assert valid
    assert insn == 0x00000002, \
        "insn @%x=%x expected 00000002" % (nia, insn)
    yield

    # another miss
    yield i_out.req.eq(1)
    yield i_out.nia.eq(Const(0x0000000000000040, 64))
    for i in range(30):
        yield
    yield
    valid = yield i_in.valid
    insn = yield i_in.insn
    nia = yield i_out.nia
    assert valid
    assert insn == 0x00000010, \
        "insn @%x=%x expected 00000010" % (nia, insn)

    # test something that aliases
    yield i_out.req.eq(1)
    yield i_out.nia.eq(Const(0x0000000000000100, 64))
    yield
    yield
    # the aliasing line must miss first (VHDL: i_in.valid = '0')
    valid = yield i_in.valid
    assert not valid
    for i in range(30):
        yield
    yield
    valid = yield i_in.valid
    insn = yield i_in.insn
    nia = yield i_out.nia
    assert valid
    assert insn == 0x00000040, \
        "insn @%x=%x expected 00000040" % (nia, insn)
    yield i_out.req.eq(0)


def test_icache():
    dut = ICache()
    vl = rtlil.convert(dut, ports=[])
    with open("test_icache.il", "w") as f:
        f.write(vl)

    #run_simulation(dut, icache_sim(dut), vcd_name='test_icache.vcd')

if __name__ == '__main__':
    test_icache()