1 """ICache
2
3 based on Anton Blanchard microwatt icache.vhdl
4
5 Set associative icache
6
7 TODO (in no specific order):
8 * Add debug interface to inspect cache content
9 * Add snoop/invalidate path
10 * Add multi-hit error detection
11 * Pipelined bus interface (wb or axi)
12 * Maybe add parity? There's a few bits free in each BRAM row on Xilinx
13 * Add optimization: service hits on partially loaded lines
14 * Add optimization: (maybe) interrupt reload on fluch/redirect
15 * Check if playing with the geometry of the cache tags allow for more
16 efficient use of distributed RAM and less logic/muxes. Currently we
17 write TAG_BITS width which may not match full ram blocks and might
18 cause muxes to be inferred for "partial writes".
19 * Check if making the read size of PLRU a ROM helps utilization
20
21 """
from enum import Enum, unique
from nmigen import (Module, Signal, Elaboratable, Cat, Array, Const,
                    ResetSignal)
from nmigen.cli import main
from nmigen.cli import rtlil
from nmutil.iocontrol import RecordObject
from nmutil.byterev import byte_reverse
from nmutil.mask import Mask
from nmigen.utils import log2_int
from nmutil.util import Display

from soc.experiment.mem_types import (Fetch1ToICacheType,
                                      ICacheToDecode1Type,
                                      MMUToICacheType)

from soc.experiment.wb_types import (WB_ADDR_BITS, WB_DATA_BITS,
                                     WB_SEL_BITS, WBAddrType, WBDataType,
                                     WBSelType, WBMasterOut, WBSlaveOut,
                                     WBMasterOutVector, WBSlaveOutVector,
                                     WBIOMasterOut, WBIOSlaveOut)

from soc.experiment.cache_ram import CacheRam
from soc.experiment.plru import PLRU

# for test
from nmigen_soc.wishbone.sram import SRAM
from nmigen import Memory
if True:
    from nmigen.back.pysim import Simulator, Delay, Settle
else:
    from nmigen.sim.cxxsim import Simulator, Delay, Settle
from nmutil.util import wrap


SIM = 0
LINE_SIZE = 64
# BRAM organisation: We never access more than wishbone_data_bits
# at a time so to save resources we make the array only that wide,
# and use consecutive indices to make a cache "line"
#
# ROW_SIZE is the width in bytes of the BRAM (based on WB, so 64-bits)
ROW_SIZE = WB_DATA_BITS // 8
# Number of lines in a set
NUM_LINES = 32
# Number of ways
NUM_WAYS = 4
# L1 ITLB number of entries (direct mapped)
TLB_SIZE = 64
# L1 ITLB log_2(page_size)
TLB_LG_PGSZ = 12
# Number of real address bits that we store
REAL_ADDR_BITS = 56
# Non-zero to enable log data collection
LOG_LENGTH = 0

ROW_SIZE_BITS = ROW_SIZE * 8
# ROW_PER_LINE is the number of row
# (wishbone) transactions in a line
ROW_PER_LINE = LINE_SIZE // ROW_SIZE
# BRAM_ROWS is the number of rows in
# BRAM needed to represent the full icache
BRAM_ROWS = NUM_LINES * ROW_PER_LINE
# INSN_PER_ROW is the number of 32bit
# instructions per BRAM row
INSN_PER_ROW = ROW_SIZE_BITS // 32

# Bit fields counts in the address
#
# INSN_BITS is the number of bits to
# select an instruction in a row
INSN_BITS = log2_int(INSN_PER_ROW)
# ROW_BITS is the number of bits to
# select a row
ROW_BITS = log2_int(BRAM_ROWS)
# ROW_LINE_BITS is the number of bits to
# select a row within a line
ROW_LINE_BITS = log2_int(ROW_PER_LINE)
# LINE_OFF_BITS is the number of bits for
# the offset in a cache line
LINE_OFF_BITS = log2_int(LINE_SIZE)
# ROW_OFF_BITS is the number of bits for
# the offset in a row
ROW_OFF_BITS = log2_int(ROW_SIZE)
# INDEX_BITS is the number of bits to
# select a cache line
INDEX_BITS = log2_int(NUM_LINES)
# SET_SIZE_BITS is the log base 2 of
# the set size
SET_SIZE_BITS = LINE_OFF_BITS + INDEX_BITS
# TAG_BITS is the number of bits of
# the tag part of the address
TAG_BITS = REAL_ADDR_BITS - SET_SIZE_BITS
# WAY_BITS is the number of bits to
# select a way
WAY_BITS = log2_int(NUM_WAYS)
TAG_RAM_WIDTH = TAG_BITS * NUM_WAYS

# -- L1 ITLB.
# constant TLB_BITS : natural := log2(TLB_SIZE);
# constant TLB_EA_TAG_BITS : natural := 64 - (TLB_LG_PGSZ + TLB_BITS);
# constant TLB_PTE_BITS : natural := 64;
TLB_BITS = log2_int(TLB_SIZE)
TLB_EA_TAG_BITS = 64 - (TLB_LG_PGSZ + TLB_BITS)
TLB_PTE_BITS = 64

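# Worked values for the default geometry above (added for reference;
# assumes WB_DATA_BITS = 64, as the BRAM organisation comment states):
#   ROW_SIZE = 8, ROW_PER_LINE = 8, BRAM_ROWS = 256, INSN_PER_ROW = 2,
#   INSN_BITS = 1, ROW_BITS = 8, ROW_LINE_BITS = 3, LINE_OFF_BITS = 6,
#   ROW_OFF_BITS = 3, INDEX_BITS = 5, SET_SIZE_BITS = 11, TAG_BITS = 45,
#   WAY_BITS = 2, TAG_RAM_WIDTH = 180, TLB_BITS = 6, TLB_EA_TAG_BITS = 46
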
# architecture rtl of icache is
#constant ROW_SIZE_BITS : natural := ROW_SIZE*8;
#-- ROW_PER_LINE is the number of row (wishbone
#-- transactions) in a line
#constant ROW_PER_LINE : natural := LINE_SIZE / ROW_SIZE;
#-- BRAM_ROWS is the number of rows in BRAM
#-- needed to represent the full
#-- icache
#constant BRAM_ROWS : natural := NUM_LINES * ROW_PER_LINE;
#-- INSN_PER_ROW is the number of 32bit instructions per BRAM row
#constant INSN_PER_ROW : natural := ROW_SIZE_BITS / 32;
#-- Bit fields counts in the address
#
#-- INSN_BITS is the number of bits to select
#-- an instruction in a row
#constant INSN_BITS : natural := log2(INSN_PER_ROW);
#-- ROW_BITS is the number of bits to select a row
#constant ROW_BITS : natural := log2(BRAM_ROWS);
#-- ROW_LINEBITS is the number of bits to
#-- select a row within a line
#constant ROW_LINEBITS : natural := log2(ROW_PER_LINE);
#-- LINE_OFF_BITS is the number of bits for the offset
#-- in a cache line
#constant LINE_OFF_BITS : natural := log2(LINE_SIZE);
#-- ROW_OFF_BITS is the number of bits for the offset in a row
#constant ROW_OFF_BITS : natural := log2(ROW_SIZE);
#-- INDEX_BITS is the number of bits to select a cache line
#constant INDEX_BITS : natural := log2(NUM_LINES);
#-- SET_SIZE_BITS is the log base 2 of the set size
#constant SET_SIZE_BITS : natural := LINE_OFF_BITS + INDEX_BITS;
#-- TAG_BITS is the number of bits of the tag part of the address
#constant TAG_BITS : natural := REAL_ADDR_BITS - SET_SIZE_BITS;
#-- WAY_BITS is the number of bits to select a way
#constant WAY_BITS : natural := log2(NUM_WAYS);

#-- Example of layout for 32 lines of 64 bytes:
#--
#--  ..  tag    |index|  line  |
#--  ..         |   row   |    |
#--  ..         |     |   | |00| zero          (2)
#--  ..         |     |   |-|  | INSN_BITS     (1)
#--  ..         |     |---|    | ROW_LINEBITS  (3)
#--  ..         |     |--- - --| LINE_OFF_BITS (6)
#--  ..         |         |- --| ROW_OFF_BITS  (3)
#--  ..         |----- ---|    | ROW_BITS      (8)
#--  ..         |-----|        | INDEX_BITS    (5)
#--  .. --------|              | TAG_BITS      (53)
# Example of layout for 32 lines of 64 bytes:
#
#  ..  tag    |index|  line  |
#  ..         |   row   |    |
#  ..         |     |   | |00| zero          (2)
#  ..         |     |   |-|  | INSN_BITS     (1)
#  ..         |     |---|    | ROW_LINEBITS  (3)
#  ..         |     |--- - --| LINE_OFF_BITS (6)
#  ..         |         |- --| ROW_OFF_BITS  (3)
#  ..         |----- ---|    | ROW_BITS      (8)
#  ..         |-----|        | INDEX_BITS    (5)
#  .. --------|              | TAG_BITS      (53)

#subtype row_t is integer range 0 to BRAM_ROWS-1;
#subtype index_t is integer range 0 to NUM_LINES-1;
#subtype way_t is integer range 0 to NUM_WAYS-1;
#subtype row_in_line_t is unsigned(ROW_LINEBITS-1 downto 0);
#
#-- The cache data BRAM organized as described above for each way
#subtype cache_row_t is std_ulogic_vector(ROW_SIZE_BITS-1 downto 0);
#
#-- The cache tags LUTRAM has a row per set. Vivado is a pain and will
#-- not handle a clean (commented) definition of the cache tags as a 3d
#-- memory. For now, work around it by putting all the tags
#subtype cache_tag_t is std_logic_vector(TAG_BITS-1 downto 0);
# type cache_tags_set_t is array(way_t) of cache_tag_t;
# type cache_tags_array_t is array(index_t) of cache_tags_set_t;
#constant TAG_RAM_WIDTH : natural := TAG_BITS * NUM_WAYS;
#subtype cache_tags_set_t is std_logic_vector(TAG_RAM_WIDTH-1 downto 0);
#type cache_tags_array_t is array(index_t) of cache_tags_set_t;
def CacheTagArray():
    return Array(Signal(TAG_RAM_WIDTH) for x in range(NUM_LINES))

#-- The cache valid bits
#subtype cache_way_valids_t is std_ulogic_vector(NUM_WAYS-1 downto 0);
#type cache_valids_t is array(index_t) of cache_way_valids_t;
#type row_per_line_valid_t is array(0 to ROW_PER_LINE - 1) of std_ulogic;
def CacheValidBitsArray():
    # one NUM_WAYS-wide valid bitmask per cache line, per the VHDL
    # cache_valids_t above (the draft mistakenly sized this like
    # RowPerLineValidArray)
    return Array(Signal(NUM_WAYS) for x in range(NUM_LINES))

def RowPerLineValidArray():
    return Array(Signal() for x in range(ROW_PER_LINE))


#attribute ram_style : string;
#attribute ram_style of cache_tags : signal is "distributed";
# TODO to be passed to nmigen as ram attributes
# attribute ram_style : string;
# attribute ram_style of cache_tags : signal is "distributed";


#subtype tlb_index_t is integer range 0 to TLB_SIZE - 1;
#type tlb_valids_t is array(tlb_index_t) of std_ulogic;
#subtype tlb_tag_t is std_ulogic_vector(TLB_EA_TAG_BITS - 1 downto 0);
#type tlb_tags_t is array(tlb_index_t) of tlb_tag_t;
#subtype tlb_pte_t is std_ulogic_vector(TLB_PTE_BITS - 1 downto 0);
#type tlb_ptes_t is array(tlb_index_t) of tlb_pte_t;
def TLBValidBitsArray():
    return Array(Signal() for x in range(TLB_SIZE))

def TLBTagArray():
    return Array(Signal(TLB_EA_TAG_BITS) for x in range(TLB_SIZE))

def TLBPTEArray():
    return Array(Signal(TLB_PTE_BITS) for x in range(TLB_SIZE))


#-- Cache RAM interface
#type cache_ram_out_t is array(way_t) of cache_row_t;
# Cache RAM interface
def CacheRamOut():
    return Array(Signal(ROW_SIZE_BITS) for x in range(NUM_WAYS))

#-- PLRU output interface
#type plru_out_t is array(index_t) of
# std_ulogic_vector(WAY_BITS-1 downto 0);
# PLRU output interface
def PLRUOut():
    return Array(Signal(WAY_BITS) for x in range(NUM_LINES))

# begin
#
#     assert LINE_SIZE mod ROW_SIZE = 0;
#     assert ispow2(LINE_SIZE) report "LINE_SIZE not power of 2"
#         severity FAILURE;
#     assert ispow2(NUM_LINES) report "NUM_LINES not power of 2"
#         severity FAILURE;
#     assert ispow2(ROW_PER_LINE) report "ROW_PER_LINE not power of 2"
#         severity FAILURE;
#     assert ispow2(INSN_PER_ROW) report "INSN_PER_ROW not power of 2"
#         severity FAILURE;
#     assert (ROW_BITS = INDEX_BITS + ROW_LINEBITS)
#         report "geometry bits don't add up" severity FAILURE;
#     assert (LINE_OFF_BITS = ROW_OFF_BITS + ROW_LINEBITS)
#         report "geometry bits don't add up" severity FAILURE;
#     assert (REAL_ADDR_BITS = TAG_BITS + INDEX_BITS + LINE_OFF_BITS)
#         report "geometry bits don't add up" severity FAILURE;
#     assert (REAL_ADDR_BITS = TAG_BITS + ROW_BITS + ROW_OFF_BITS)
#         report "geometry bits don't add up" severity FAILURE;
#
#     sim_debug: if SIM generate
#     debug: process
#     begin
#         report "ROW_SIZE = " & natural'image(ROW_SIZE);
#         report "ROW_PER_LINE = " & natural'image(ROW_PER_LINE);
#         report "BRAM_ROWS = " & natural'image(BRAM_ROWS);
#         report "INSN_PER_ROW = " & natural'image(INSN_PER_ROW);
#         report "INSN_BITS = " & natural'image(INSN_BITS);
#         report "ROW_BITS = " & natural'image(ROW_BITS);
#         report "ROW_LINEBITS = " & natural'image(ROW_LINEBITS);
#         report "LINE_OFF_BITS = " & natural'image(LINE_OFF_BITS);
#         report "ROW_OFF_BITS = " & natural'image(ROW_OFF_BITS);
#         report "INDEX_BITS = " & natural'image(INDEX_BITS);
#         report "TAG_BITS = " & natural'image(TAG_BITS);
#         report "WAY_BITS = " & natural'image(WAY_BITS);
#     wait;
#     end process;
#     end generate;
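
# Python equivalents of the VHDL geometry assertions quoted above,
# checked once at import time (a sanity-check sketch added here; it is
# not part of the original translation)
assert LINE_SIZE % ROW_SIZE == 0, "LINE_SIZE not multiple of ROW_SIZE"
assert LINE_SIZE & (LINE_SIZE - 1) == 0, "LINE_SIZE not power of 2"
assert NUM_LINES & (NUM_LINES - 1) == 0, "NUM_LINES not power of 2"
assert ROW_PER_LINE & (ROW_PER_LINE - 1) == 0, \
    "ROW_PER_LINE not power of 2"
assert INSN_PER_ROW & (INSN_PER_ROW - 1) == 0, \
    "INSN_PER_ROW not power of 2"
assert ROW_BITS == INDEX_BITS + ROW_LINE_BITS, \
    "geometry bits don't add up"
assert LINE_OFF_BITS == ROW_OFF_BITS + ROW_LINE_BITS, \
    "geometry bits don't add up"
assert REAL_ADDR_BITS == TAG_BITS + INDEX_BITS + LINE_OFF_BITS, \
    "geometry bits don't add up"
assert REAL_ADDR_BITS == TAG_BITS + ROW_BITS + ROW_OFF_BITS, \
    "geometry bits don't add up"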

# Cache reload state machine
@unique
class State(Enum):
    IDLE = 0
    CLR_TAG = 1
    WAIT_ACK = 2

# type reg_internal_t is record
#     -- Cache hit state (Latches for 1 cycle BRAM access)
#     hit_way   : way_t;
#     hit_nia   : std_ulogic_vector(63 downto 0);
#     hit_smark : std_ulogic;
#     hit_valid : std_ulogic;
#
#     -- Cache miss state (reload state machine)
#     state        : state_t;
#     wb           : wishbone_master_out;
#     store_way    : way_t;
#     store_index  : index_t;
#     store_row    : row_t;
#     store_tag    : cache_tag_t;
#     store_valid  : std_ulogic;
#     end_row_ix   : row_in_line_t;
#     rows_valid   : row_per_line_valid_t;
#
#     -- TLB miss state
#     fetch_failed : std_ulogic;
# end record;
class RegInternal(RecordObject):
    def __init__(self):
        super().__init__()
        # Cache hit state (Latches for 1 cycle BRAM access)
        self.hit_way = Signal(WAY_BITS)
        self.hit_nia = Signal(64)
        self.hit_smark = Signal()
        self.hit_valid = Signal()

        # Cache miss state (reload state machine)
        self.state = Signal(State)
        self.wb = WBMasterOut()
        self.store_way = Signal(WAY_BITS)
        self.store_index = Signal(INDEX_BITS)
        self.store_row = Signal(ROW_BITS)
        self.store_tag = Signal(TAG_BITS)
        self.store_valid = Signal()
        self.end_row_ix = Signal(ROW_LINE_BITS)
        self.rows_valid = RowPerLineValidArray()

        # TLB miss state
        self.fetch_failed = Signal()

# -- 64 bit direct mapped icache. All instructions are 4B aligned.
#
# entity icache is
#     generic (
#         SIM : boolean := false;
#         -- Line size in bytes
#         LINE_SIZE : positive := 64;
#         -- BRAM organisation: We never access more
#         -- than wishbone_data_bits
#         -- at a time so to save resources we make the
#         -- array only that wide,
#         -- and use consecutive indices for to make a cache "line"
#         --
#         -- ROW_SIZE is the width in bytes of the BRAM (based on WB,
#         -- so 64-bits)
#         ROW_SIZE : positive := wishbone_data_bits / 8;
#         -- Number of lines in a set
#         NUM_LINES : positive := 32;
#         -- Number of ways
#         NUM_WAYS : positive := 4;
#         -- L1 ITLB number of entries (direct mapped)
#         TLB_SIZE : positive := 64;
#         -- L1 ITLB log_2(page_size)
#         TLB_LG_PGSZ : positive := 12;
#         -- Number of real address bits that we store
#         REAL_ADDR_BITS : positive := 56;
#         -- Non-zero to enable log data collection
#         LOG_LENGTH : natural := 0
#     );
#     port (
#         clk : in std_ulogic;
#         rst : in std_ulogic;
#
#         i_in  : in Fetch1ToIcacheType;
#         i_out : out IcacheToDecode1Type;
#
#         m_in  : in MmuToIcacheType;
#
#         stall_in  : in std_ulogic;
#         stall_out : out std_ulogic;
#         flush_in  : in std_ulogic;
#         inval_in  : in std_ulogic;
#
#         wishbone_out : out wishbone_master_out;
#         wishbone_in  : in wishbone_slave_out;
#
#         log_out : out std_ulogic_vector(53 downto 0)
#     );
# end entity icache;
# 64 bit direct mapped icache. All instructions are 4B aligned.
class ICache(Elaboratable):
    """64 bit direct mapped icache. All instructions are 4B aligned."""
    def __init__(self):
        self.i_in = Fetch1ToICacheType()
        self.i_out = ICacheToDecode1Type()

        self.m_in = MMUToICacheType()

        self.stall_in = Signal()
        self.stall_out = Signal()
        self.flush_in = Signal()
        self.inval_in = Signal()

        self.wb_out = WBMasterOut()
        self.wb_in = WBSlaveOut()

        self.log_out = Signal(54)

    # -- Return the cache line index (tag index) for an address
    # function get_index(addr: std_ulogic_vector(63 downto 0))
    #     return index_t is
    # begin
    #     return to_integer(unsigned(
    #      addr(SET_SIZE_BITS - 1 downto LINE_OFF_BITS)
    #     ));
    # end;
    # Return the cache line index (tag index) for an address
    def get_index(addr):
        return addr[LINE_OFF_BITS:SET_SIZE_BITS]

    # -- Return the cache row index (data memory) for an address
    # function get_row(addr: std_ulogic_vector(63 downto 0))
    #     return row_t is
    # begin
    #     return to_integer(unsigned(
    #      addr(SET_SIZE_BITS - 1 downto ROW_OFF_BITS)
    #     ));
    # end;
    # Return the cache row index (data memory) for an address
    def get_row(addr):
        return addr[ROW_OFF_BITS:SET_SIZE_BITS]

    # -- Return the index of a row within a line
    # function get_row_of_line(row: row_t) return row_in_line_t is
    #     variable row_v : unsigned(ROW_BITS-1 downto 0);
    # begin
    #     row_v := to_unsigned(row, ROW_BITS);
    #     return row_v(ROW_LINEBITS-1 downto 0);
    # end;
    # Return the index of a row within a line
    def get_row_of_line(row):
        return row[:ROW_LINE_BITS]

    # -- Returns whether this is the last row of a line
    # function is_last_row_addr(addr: wishbone_addr_type;
    #     last: row_in_line_t
    #     )
    #     return boolean is
    # begin
    #     return unsigned(
    #      addr(LINE_OFF_BITS-1 downto ROW_OFF_BITS)
    #     ) = last;
    # end;
    # Returns whether this is the last row of a line
    def is_last_row_addr(addr, last):
        return addr[ROW_OFF_BITS:LINE_OFF_BITS] == last

    # -- Returns whether this is the last row of a line
    # function is_last_row(row: row_t;
    #     last: row_in_line_t) return boolean is
    # begin
    #     return get_row_of_line(row) = last;
    # end;
    # Returns whether this is the last row of a line
    def is_last_row(row, last):
        return get_row_of_line(row) == last

    # -- Return the address of the next row in the current cache line
    # function next_row_addr(addr: wishbone_addr_type)
    #     return std_ulogic_vector is
    #     variable row_idx : std_ulogic_vector(ROW_LINEBITS-1 downto 0);
    #     variable result  : wishbone_addr_type;
    # begin
    #     -- Is there no simpler way in VHDL to generate that 3 bits adder ?
    #     row_idx := addr(LINE_OFF_BITS-1 downto ROW_OFF_BITS);
    #     row_idx := std_ulogic_vector(unsigned(row_idx) + 1);
    #     result := addr;
    #     result(LINE_OFF_BITS-1 downto ROW_OFF_BITS) := row_idx;
    #     return result;
    # end;
    # Return the address of the next row in the current cache line
    def next_row_addr(addr):
        # The VHDL's pair of variable assignments extracts the
        # row-within-line field, increments it (wrapping inside the
        # line), and splices it back into the address.  Re-concatenating
        # the three fields does the same thing here.
        row_idx = addr[ROW_OFF_BITS:LINE_OFF_BITS] + 1
        return Cat(addr[:ROW_OFF_BITS], row_idx[:ROW_LINE_BITS],
                   addr[LINE_OFF_BITS:])

    # -- Return the next row in the current cache line. We use a dedicated
    # -- function in order to limit the size of the generated adder to be
    # -- only the bits within a cache line (3 bits with default settings)
    # function next_row(row: row_t) return row_t is
    #     variable row_v   : std_ulogic_vector(ROW_BITS-1 downto 0);
    #     variable row_idx : std_ulogic_vector(ROW_LINEBITS-1 downto 0);
    #     variable result  : std_ulogic_vector(ROW_BITS-1 downto 0);
    # begin
    #     row_v := std_ulogic_vector(to_unsigned(row, ROW_BITS));
    #     row_idx := row_v(ROW_LINEBITS-1 downto 0);
    #     row_v(ROW_LINEBITS-1 downto 0) :=
    #      std_ulogic_vector(unsigned(row_idx) + 1);
    #     return to_integer(unsigned(row_v));
    # end;
    # Return the next row in the current cache line. We use a dedicated
    # function in order to limit the size of the generated adder to be
    # only the bits within a cache line (3 bits with default settings)
    def next_row(row):
        # same trick as next_row_addr: only the low ROW_LINE_BITS bits
        # of the row number are incremented
        row_idx = row[:ROW_LINE_BITS] + 1
        return Cat(row_idx[:ROW_LINE_BITS], row[ROW_LINE_BITS:])

    # -- Read the instruction word for the given address in the
    # -- current cache row
    # function read_insn_word(addr: std_ulogic_vector(63 downto 0);
    #     data: cache_row_t) return std_ulogic_vector is
    #     variable word: integer range 0 to INSN_PER_ROW-1;
    # begin
    #     word := to_integer(unsigned(addr(INSN_BITS+2-1 downto 2)));
    #     return data(31+word*32 downto word*32);
    # end;
    # Read the instruction word for the given address
    # in the current cache row
    def read_insn_word(addr, data):
        word = addr[2:INSN_BITS+2]
        # variable-index slice: use word_select to mux the 32-bit word
        return data.word_select(word, 32)

    # -- Get the tag value from the address
    # function get_tag(
    #     addr: std_ulogic_vector(REAL_ADDR_BITS - 1 downto 0)
    #     )
    #     return cache_tag_t is
    # begin
    #     return addr(REAL_ADDR_BITS - 1 downto SET_SIZE_BITS);
    # end;
    # Get the tag value from the address
    def get_tag(addr):
        return addr[SET_SIZE_BITS:REAL_ADDR_BITS]

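    # Worked example (added for illustration, not in the original):
    # with the default geometry, LINE_OFF_BITS=6, SET_SIZE_BITS=11 and
    # REAL_ADDR_BITS=56, so for the real address 0x1FC4:
    #   get_index -> addr[6:11]  = 0b11111    (line 31)
    #   get_row   -> addr[3:11]  = 0b11111000 (BRAM row 248)
    #   get_tag   -> addr[11:56] = 0x3
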
    # -- Read a tag from a tag memory row
    # function read_tag(way: way_t; tagset: cache_tags_set_t)
    #     return cache_tag_t is
    # begin
    #     return tagset((way+1) * TAG_BITS - 1 downto way * TAG_BITS);
    # end;
    # Read a tag from a tag memory row
    def read_tag(way, tagset):
        return tagset[way * TAG_BITS:(way + 1) * TAG_BITS]

    # -- Write a tag to tag memory row
    # procedure write_tag(way: in way_t;
    #     tagset: inout cache_tags_set_t; tag: cache_tag_t) is
    # begin
    #     tagset((way+1) * TAG_BITS - 1 downto way * TAG_BITS) := tag;
    # end;
    # Write a tag to tag memory row
    def write_tag(way, tagset, tag):
        # return the assignment so the caller can add it to a domain
        # (plain Python slice-assignment does not work on nmigen Values)
        return tagset[way * TAG_BITS:(way + 1) * TAG_BITS].eq(tag)

    # -- Simple hash for direct-mapped TLB index
    # function hash_ea(addr: std_ulogic_vector(63 downto 0))
    #     return tlb_index_t is
    #     variable hash : std_ulogic_vector(TLB_BITS - 1 downto 0);
    # begin
    #     hash := addr(TLB_LG_PGSZ + TLB_BITS - 1 downto TLB_LG_PGSZ)
    #      xor addr(
    #       TLB_LG_PGSZ + 2 * TLB_BITS - 1 downto
    #       TLB_LG_PGSZ + TLB_BITS
    #      )
    #      xor addr(
    #       TLB_LG_PGSZ + 3 * TLB_BITS - 1 downto
    #       TLB_LG_PGSZ + 2 * TLB_BITS
    #      );
    #     return to_integer(unsigned(hash));
    # end;
    # Simple hash for direct-mapped TLB index
    def hash_ea(addr):
        hsh = addr[TLB_LG_PGSZ:TLB_LG_PGSZ + TLB_BITS] ^ addr[
            TLB_LG_PGSZ + TLB_BITS:TLB_LG_PGSZ + 2 * TLB_BITS
        ] ^ addr[
            TLB_LG_PGSZ + 2 * TLB_BITS:TLB_LG_PGSZ + 3 * TLB_BITS
        ]
        return hsh

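    # Reference model of hash_ea for plain Python integers (an
    # illustrative sketch added here, not in the original file):
    # handy for checking TLB index computations in unit tests.
    def hash_ea_int(addr):
        mask = (1 << TLB_BITS) - 1
        return (((addr >> TLB_LG_PGSZ) ^
                 (addr >> (TLB_LG_PGSZ + TLB_BITS)) ^
                 (addr >> (TLB_LG_PGSZ + 2 * TLB_BITS))) & mask)
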
    # -- Generate a cache RAM for each way
    # rams: for i in 0 to NUM_WAYS-1 generate
    #     signal do_read  : std_ulogic;
    #     signal do_write : std_ulogic;
    #     signal rd_addr  : std_ulogic_vector(ROW_BITS-1 downto 0);
    #     signal wr_addr  : std_ulogic_vector(ROW_BITS-1 downto 0);
    #     signal dout     : cache_row_t;
    #     signal wr_sel   : std_ulogic_vector(ROW_SIZE-1 downto 0);
    # begin
    #     way: entity work.cache_ram
    #         generic map (
    #             ROW_BITS => ROW_BITS,
    #             WIDTH => ROW_SIZE_BITS
    #         )
    #         port map (
    #             clk => clk,
    #             rd_en => do_read,
    #             rd_addr => rd_addr,
    #             rd_data => dout,
    #             wr_sel => wr_sel,
    #             wr_addr => wr_addr,
    #             wr_data => wishbone_in.dat
    #         );
    #     process(all)
    #     begin
    #         do_read <= not (stall_in or use_previous);
    #         do_write <= '0';
    #         if wishbone_in.ack = '1' and replace_way = i then
    #             do_write <= '1';
    #         end if;
    #         cache_out(i) <= dout;
    #         rd_addr <=
    #          std_ulogic_vector(to_unsigned(req_row, ROW_BITS));
    #         wr_addr <=
    #          std_ulogic_vector(to_unsigned(r.store_row, ROW_BITS));
    #         for i in 0 to ROW_SIZE-1 loop
    #             wr_sel(i) <= do_write;
    #         end loop;
    #     end process;
    # end generate;
    def rams(self, m):
        comb = m.d.comb

        do_read = Signal()
        do_write = Signal()
        rd_addr = Signal(ROW_BITS)
        wr_addr = Signal(ROW_BITS)
        _d_out = Signal(ROW_SIZE_BITS)
        wr_sel = Signal(ROW_SIZE)

        for i in range(NUM_WAYS):
            way = CacheRam(ROW_BITS, ROW_SIZE_BITS)
            m.submodules += way
            comb += way.rd_en.eq(do_read)
            comb += way.rd_addr.eq(rd_addr)
            # rd_data is an output of the RAM: read it, don't drive it
            comb += _d_out.eq(way.rd_data)
            comb += way.wr_sel.eq(wr_sel)
            comb += way.wr_addr.eq(wr_addr)
            comb += way.wr_data.eq(wb_in.dat)

            comb += do_read.eq(~(stall_in | use_previous))
            comb += do_write.eq(0)

            with m.If(wb_in.ack & (replace_way == i)):
                comb += do_write.eq(1)

            comb += cache_out[i].eq(_d_out)
            comb += rd_addr.eq(req_row)
            comb += wr_addr.eq(r.store_row)
            for j in range(ROW_SIZE):
                comb += wr_sel[j].eq(do_write)

    # -- Generate PLRUs
    # maybe_plrus: if NUM_WAYS > 1 generate
    # begin
    #     plrus: for i in 0 to NUM_LINES-1 generate
    #         -- PLRU interface
    #         signal plru_acc    : std_ulogic_vector(WAY_BITS-1 downto 0);
    #         signal plru_acc_en : std_ulogic;
    #         signal plru_out    : std_ulogic_vector(WAY_BITS-1 downto 0);
    #
    #     begin
    #         plru : entity work.plru
    #             generic map (
    #                 BITS => WAY_BITS
    #             )
    #             port map (
    #                 clk => clk,
    #                 rst => rst,
    #                 acc => plru_acc,
    #                 acc_en => plru_acc_en,
    #                 lru => plru_out
    #             );
    #
    #         process(all)
    #         begin
    #             -- PLRU interface
    #             if get_index(r.hit_nia) = i then
    #                 plru_acc_en <= r.hit_valid;
    #             else
    #                 plru_acc_en <= '0';
    #             end if;
    #             plru_acc <=
    #              std_ulogic_vector(to_unsigned(r.hit_way, WAY_BITS));
    #             plru_victim(i) <= plru_out;
    #         end process;
    #     end generate;
    # end generate;
    def maybe_plrus(self, m):
        comb = m.d.comb

        # elaboration-time condition, like the VHDL "generate"
        if NUM_WAYS > 1:
            for i in range(NUM_LINES):
                plru_acc = Signal(WAY_BITS)
                plru_acc_en = Signal()
                plru_out = Signal(WAY_BITS)
                plru = PLRU(WAY_BITS)
                m.submodules += plru
                comb += plru.acc.eq(plru_acc)
                comb += plru.acc_en.eq(plru_acc_en)
                # lru is an output of the PLRU
                comb += plru_out.eq(plru.lru)

                # PLRU interface
                with m.If(get_index(r.hit_nia) == i):
                    comb += plru.acc_en.eq(r.hit_valid)

                with m.Else():
                    comb += plru.acc_en.eq(0)

                comb += plru.acc.eq(r.hit_way)
                comb += plru_victim[i].eq(plru.lru)

    # -- TLB hit detection and real address generation
    # itlb_lookup : process(all)
    #     variable pte : tlb_pte_t;
    #     variable ttag : tlb_tag_t;
    # begin
    #     tlb_req_index <= hash_ea(i_in.nia);
    #     pte := itlb_ptes(tlb_req_index);
    #     ttag := itlb_tags(tlb_req_index);
    #     if i_in.virt_mode = '1' then
    #         real_addr <= pte(REAL_ADDR_BITS - 1 downto TLB_LG_PGSZ) &
    #                      i_in.nia(TLB_LG_PGSZ - 1 downto 0);
    #         if ttag = i_in.nia(63 downto TLB_LG_PGSZ + TLB_BITS) then
    #             ra_valid <= itlb_valids(tlb_req_index);
    #         else
    #             ra_valid <= '0';
    #         end if;
    #         eaa_priv <= pte(3);
    #     else
    #         real_addr <= i_in.nia(REAL_ADDR_BITS - 1 downto 0);
    #         ra_valid <= '1';
    #         eaa_priv <= '1';
    #     end if;
    #
    #     -- no IAMR, so no KUEP support for now
    #     priv_fault <= eaa_priv and not i_in.priv_mode;
    #     access_ok <= ra_valid and not priv_fault;
    # end process;
    # TLB hit detection and real address generation
    def itlb_lookup(self, m):
        comb = m.d.comb

        # VHDL process variables
        pte = Signal(TLB_PTE_BITS)
        ttag = Signal(TLB_EA_TAG_BITS)

        comb += tlb_req_index.eq(hash_ea(i_in.nia))
        comb += pte.eq(itlb_ptes[tlb_req_index])
        comb += ttag.eq(itlb_tags[tlb_req_index])

        with m.If(i_in.virt_mode):
            comb += real_addr.eq(Cat(
                i_in.nia[:TLB_LG_PGSZ],
                pte[TLB_LG_PGSZ:REAL_ADDR_BITS]
            ))

            with m.If(ttag == i_in.nia[TLB_LG_PGSZ + TLB_BITS:64]):
                comb += ra_valid.eq(itlb_valid_bits[tlb_req_index])

            with m.Else():
                comb += ra_valid.eq(0)

            comb += eaa_priv.eq(pte[3])

        with m.Else():
            comb += real_addr.eq(i_in.nia[:REAL_ADDR_BITS])
            comb += ra_valid.eq(1)
            comb += eaa_priv.eq(1)

        # No IAMR, so no KUEP support for now
        comb += priv_fault.eq(eaa_priv & ~i_in.priv_mode)
        comb += access_ok.eq(ra_valid & ~priv_fault)

    # -- iTLB update
    # itlb_update: process(clk)
    #     variable wr_index : tlb_index_t;
    # begin
    #     if rising_edge(clk) then
    #         wr_index := hash_ea(m_in.addr);
    #         if rst = '1' or
    #             (m_in.tlbie = '1' and m_in.doall = '1') then
    #             -- clear all valid bits
    #             for i in tlb_index_t loop
    #                 itlb_valids(i) <= '0';
    #             end loop;
    #         elsif m_in.tlbie = '1' then
    #             -- clear entry regardless of hit or miss
    #             itlb_valids(wr_index) <= '0';
    #         elsif m_in.tlbld = '1' then
    #             itlb_tags(wr_index) <=
    #                 m_in.addr(63 downto TLB_LG_PGSZ + TLB_BITS);
    #             itlb_ptes(wr_index) <= m_in.pte;
    #             itlb_valids(wr_index) <= '1';
    #         end if;
    #     end if;
    # end process;
    # iTLB update
    def itlb_update(self, m):
        comb = m.d.comb
        sync = m.d.sync

        # a VHDL process variable: combinatorial, not registered
        wr_index = Signal(TLB_BITS)
        comb += wr_index.eq(hash_ea(m_in.addr))

        # (ResetSignal() stands in for the VHDL rst port)
        with m.If(ResetSignal() | (m_in.tlbie & m_in.doall)):
            # Clear all valid bits
            for i in range(TLB_SIZE):
                sync += itlb_valid_bits[i].eq(0)

        with m.Elif(m_in.tlbie):
            # Clear entry regardless of hit or miss
            sync += itlb_valid_bits[wr_index].eq(0)

        with m.Elif(m_in.tlbld):
            sync += itlb_tags[wr_index].eq(
                m_in.addr[TLB_LG_PGSZ + TLB_BITS:64]
            )
            sync += itlb_ptes[wr_index].eq(m_in.pte)
            sync += itlb_valid_bits[wr_index].eq(1)

    # -- Cache hit detection, output to fetch2 and other misc logic
    # icache_comb : process(all)
    # Cache hit detection, output to fetch2 and other misc logic
    def icache_comb(self, m):
        # variable is_hit  : std_ulogic;
        # variable hit_way : way_t;
        comb = m.d.comb

        is_hit = Signal()
        hit_way = Signal(WAY_BITS)
        # begin
        #     -- i_in.sequential means that i_in.nia this cycle
        #     -- is 4 more than last cycle. If we read more
        #     -- than 32 bits at a time, had a cache hit last
        #     -- cycle, and we don't want the first 32-bit chunk
        #     -- then we can keep the data we read last cycle
        #     -- and just use that.
        #     if unsigned(i_in.nia(INSN_BITS+2-1 downto 2)) /= 0 then
        #         use_previous <= i_in.sequential and r.hit_valid;
        #     else
        #         use_previous <= '0';
        #     end if;
        # i_in.sequential means that i_in.nia this cycle is 4 more than
        # last cycle. If we read more than 32 bits at a time, had a
        # cache hit last cycle, and we don't want the first 32-bit chunk
        # then we can keep the data we read last cycle and just use that.
        with m.If(i_in.nia[2:INSN_BITS+2] != 0):
            comb += use_previous.eq(i_in.sequential & r.hit_valid)

        with m.Else():
            comb += use_previous.eq(0)

        #     -- Extract line, row and tag from request
        #     req_index <= get_index(i_in.nia);
        #     req_row <= get_row(i_in.nia);
        #     req_tag <= get_tag(real_addr);
        # Extract line, row and tag from request
        comb += req_index.eq(get_index(i_in.nia))
        comb += req_row.eq(get_row(i_in.nia))
        comb += req_tag.eq(get_tag(real_addr))

        #     -- Calculate address of beginning of cache row, will be
        #     -- used for cache miss processing if needed
        #     req_laddr <=
        #      (63 downto REAL_ADDR_BITS => '0') &
        #      real_addr(REAL_ADDR_BITS - 1 downto ROW_OFF_BITS) &
        #      (ROW_OFF_BITS-1 downto 0 => '0');
        # Calculate address of beginning of cache row, will be
        # used for cache miss processing if needed
        comb += req_laddr.eq(Cat(
            Const(0, ROW_OFF_BITS),
            real_addr[ROW_OFF_BITS:REAL_ADDR_BITS],
            Const(0, 64 - REAL_ADDR_BITS)  # zero-pad up to 64 bits
        ))

        #     -- Test if pending request is a hit on any way
        #     hit_way := 0;
        #     is_hit := '0';
        #     for i in way_t loop
        #         if i_in.req = '1' and
        #             (cache_valids(req_index)(i) = '1' or
        #              (r.state = WAIT_ACK and
        #               req_index = r.store_index and
        #               i = r.store_way and
        #               r.rows_valid(req_row mod ROW_PER_LINE) = '1')) then
        #             if read_tag(i, cache_tags(req_index)) = req_tag then
        #                 hit_way := i;
        #                 is_hit := '1';
        #             end if;
        #         end if;
        #     end loop;
        # Test if pending request is a hit on any way
        for i in range(NUM_WAYS):
            with m.If(i_in.req &
                      (cache_valid_bits[req_index][i] |
                       ((r.state == State.WAIT_ACK)
                        & (req_index == r.store_index)
                        & (i == r.store_way)
                        # req_row mod ROW_PER_LINE: the low bits
                        & r.rows_valid[req_row[:ROW_LINE_BITS]]))):
                with m.If(read_tag(i, cache_tags[req_index]) == req_tag):
                    comb += hit_way.eq(i)
                    comb += is_hit.eq(1)

        #     -- Generate the "hit" and "miss" signals
        #     -- for the synchronous blocks
        #     if i_in.req = '1' and access_ok = '1' and flush_in = '0'
        #         and rst = '0' then
        #         req_is_hit <= is_hit;
        #         req_is_miss <= not is_hit;
        #     else
        #         req_is_hit <= '0';
        #         req_is_miss <= '0';
        #     end if;
        #     req_hit_way <= hit_way;
        # Generate the "hit" and "miss" signals
        # for the synchronous blocks
        with m.If(i_in.req & access_ok & ~flush_in):
            comb += req_is_hit.eq(is_hit)
            comb += req_is_miss.eq(~is_hit)

        with m.Else():
            comb += req_is_hit.eq(0)
            comb += req_is_miss.eq(0)

        comb += req_hit_way.eq(hit_way)

        #     -- The way to replace on a miss
        #     if r.state = CLR_TAG then
        #         replace_way <=
        #             to_integer(unsigned(plru_victim(r.store_index)));
        #     else
        #         replace_way <= r.store_way;
        #     end if;
        # The way to replace on a miss
        with m.If(r.state == State.CLR_TAG):
            comb += replace_way.eq(plru_victim[r.store_index])

        with m.Else():
            comb += replace_way.eq(r.store_way)

        #     -- Output instruction from current cache row
        #     --
        #     -- Note: This is a mild violation of our design principle of
        #     -- having pipeline stages output from a clean latch. In this
        #     -- case we output the result of a mux. The alternative would
        #     -- be output an entire row which I prefer not to do just yet
        #     -- as it would force fetch2 to know about some of the cache
        #     -- geometry information.
        #     i_out.insn <= read_insn_word(r.hit_nia, cache_out(r.hit_way));
        #     i_out.valid <= r.hit_valid;
        #     i_out.nia <= r.hit_nia;
        #     i_out.stop_mark <= r.hit_smark;
        #     i_out.fetch_failed <= r.fetch_failed;
        # Output instruction from current cache row
        #
        # Note: This is a mild violation of our design principle of
        # having pipeline stages output from a clean latch. In this
        # case we output the result of a mux. The alternative would
        # be to output an entire row, which I prefer not to do just yet
        # as it would force fetch2 to know about some of the cache
        # geometry information.
        comb += i_out.insn.eq(
            read_insn_word(r.hit_nia, cache_out[r.hit_way])
        )
        comb += i_out.valid.eq(r.hit_valid)
        comb += i_out.nia.eq(r.hit_nia)
        comb += i_out.stop_mark.eq(r.hit_smark)
        comb += i_out.fetch_failed.eq(r.fetch_failed)

        #     -- Stall fetch1 if we have a miss on cache or TLB
        #     -- or a protection fault
        #     stall_out <= not (is_hit and access_ok);
        # Stall fetch1 if we have a miss on cache or TLB
        # or a protection fault
        comb += stall_out.eq(~(is_hit & access_ok))

        #     -- Wishbone requests output (from the cache miss reload machine)
        #     wishbone_out <= r.wb;
        # Wishbone requests output (from the cache miss reload machine)
        comb += wb_out.eq(r.wb)
        # end process;

    # -- Cache hit synchronous machine
    # icache_hit : process(clk)
    # Cache hit synchronous machine
    def icache_hit(self, m):
        sync = m.d.sync
        # begin
        # if rising_edge(clk) then
        #     -- keep outputs to fetch2 unchanged on a stall
        #     -- except that flush or reset sets valid to 0
        #     -- If use_previous, keep the same data as last
        #     -- cycle and use the second half
        #     if stall_in = '1' or use_previous = '1' then
        #         if rst = '1' or flush_in = '1' then
        #             r.hit_valid <= '0';
        #         end if;
        # keep outputs to fetch2 unchanged on a stall
        # except that flush or reset sets valid to 0
        # If use_previous, keep the same data as last
        # cycle and use the second half
        with m.If(stall_in | use_previous):
            # (ResetSignal() stands in for the VHDL rst port)
            with m.If(ResetSignal() | flush_in):
                sync += r.hit_valid.eq(0)
        # else
        #     -- On a hit, latch the request for the next cycle,
        #     -- when the BRAM data will be available on the
        #     -- cache_out output of the corresponding way
        #     r.hit_valid <= req_is_hit;
        #     if req_is_hit = '1' then
        #         r.hit_way <= req_hit_way;
        with m.Else():
            # On a hit, latch the request for the next cycle,
            # when the BRAM data will be available on the
            # cache_out output of the corresponding way
            sync += r.hit_valid.eq(req_is_hit)

            with m.If(req_is_hit):
                sync += r.hit_way.eq(req_hit_way)

                # report "cache hit nia:" & to_hstring(i_in.nia) &
                #  " IR:" & std_ulogic'image(i_in.virt_mode) &
                #  " SM:" & std_ulogic'image(i_in.stop_mark) &
                #  " idx:" & integer'image(req_index) &
                #  " tag:" & to_hstring(req_tag) &
                #  " way:" & integer'image(req_hit_way) &
                #  " RA:" & to_hstring(real_addr);
                # simulation-time report (a Python print here would only
                # print Signal reprs once, at elaboration time)
                sync += Display("cache hit nia:%x IR:%x SM:%x idx:%x "
                                "tag:%x way:%x RA:%x",
                                i_in.nia, i_in.virt_mode, i_in.stop_mark,
                                req_index, req_tag, req_hit_way,
                                real_addr)
        # end if;
        # end if;
        # if stall_in = '0' then
        #     -- Send stop marks and NIA down regardless of validity
        #     r.hit_smark <= i_in.stop_mark;
        #     r.hit_nia <= i_in.nia;
        # end if;
        with m.If(~stall_in):
            # Send stop marks and NIA down regardless of validity
            sync += r.hit_smark.eq(i_in.stop_mark)
            sync += r.hit_nia.eq(i_in.nia)
        # end if;
        # end process;

    # -- Cache miss/reload synchronous machine
    # icache_miss : process(clk)
    # Cache miss/reload synchronous machine
    def icache_miss(self, m):
        comb = m.d.comb
        sync = m.d.sync

        # variable tagset : cache_tags_set_t;
        # variable stbs_done : boolean;

        tagset = Signal(TAG_RAM_WIDTH)
        stbs_done = Signal()

        # begin
        # if rising_edge(clk) then
        #     -- On reset, clear all valid bits to force misses
        #     if rst = '1' then
        # On reset, clear all valid bits to force misses
        # (ResetSignal() stands in for the VHDL rst port; nmigen's
        # sync domain also applies these resets implicitly)
        with m.If(ResetSignal()):
            # for i in index_t loop
            #     cache_valids(i) <= (others => '0');
            # end loop;
            for i in range(NUM_LINES):
                sync += cache_valid_bits[i].eq(0)

            # r.state <= IDLE;
            # r.wb.cyc <= '0';
            # r.wb.stb <= '0';
            sync += r.state.eq(State.IDLE)
            sync += r.wb.cyc.eq(0)
            sync += r.wb.stb.eq(0)

            # -- We only ever do reads on wishbone
            # r.wb.dat <= (others => '0');
            # r.wb.sel <= "11111111";
            # r.wb.we  <= '0';
            # We only ever do reads on wishbone
            sync += r.wb.dat.eq(0)
            sync += r.wb.sel.eq(Const(0b11111111, 8))
            sync += r.wb.we.eq(0)

            # -- Not useful normally but helps avoiding
            # -- tons of sim warnings
            # r.wb.adr <= (others => '0');
            # Not useful normally but helps avoiding tons of sim warnings
            sync += r.wb.adr.eq(0)

        # else
        with m.Else():
            # -- Process cache invalidations
            # if inval_in = '1' then
            #     for i in index_t loop
            #         cache_valids(i) <= (others => '0');
            #     end loop;
            #     r.store_valid <= '0';
            # end if;
            # Process cache invalidations
            with m.If(inval_in):
                for i in range(NUM_LINES):
                    sync += cache_valid_bits[i].eq(0)

                sync += r.store_valid.eq(0)

            # -- Main state machine
            # case r.state is
            # Main state machine
            with m.Switch(r.state):

                # when IDLE =>
                with m.Case(State.IDLE):
                    # -- Reset per-row valid flags,
                    # -- only used in WAIT_ACK
                    # for i in 0 to ROW_PER_LINE - 1 loop
                    #     r.rows_valid(i) <= '0';
                    # end loop;
                    # Reset per-row valid flags,
                    # only used in WAIT_ACK
                    for i in range(ROW_PER_LINE):
                        sync += r.rows_valid[i].eq(0)

                    # -- We need to read a cache line
                    # if req_is_miss = '1' then
                    #     report "cache miss nia:" & to_hstring(i_in.nia) &
                    #      " IR:" & std_ulogic'image(i_in.virt_mode) &
                    #      " SM:" & std_ulogic'image(i_in.stop_mark) &
                    #      " idx:" & integer'image(req_index) &
                    #      " way:" & integer'image(replace_way) &
                    #      " tag:" & to_hstring(req_tag) &
                    #      " RA:" & to_hstring(real_addr);
                    # We need to read a cache line
                    with m.If(req_is_miss):
                        # simulation-time report, as in icache_hit
                        sync += Display(
                            "cache miss nia:%x IR:%x SM:%x idx:%x "
                            "way:%x tag:%x RA:%x",
                            i_in.nia, i_in.virt_mode, i_in.stop_mark,
                            req_index, replace_way, req_tag, real_addr)

                        # -- Keep track of our index and way for
                        # -- subsequent stores
                        # r.store_index <= req_index;
                        # r.store_row <= get_row(req_laddr);
                        # r.store_tag <= req_tag;
                        # r.store_valid <= '1';
                        # r.end_row_ix <=
                        #  get_row_of_line(get_row(req_laddr)) - 1;
                        # Keep track of our index and way
                        # for subsequent stores
                        sync += r.store_index.eq(req_index)
                        sync += r.store_row.eq(get_row(req_laddr))
                        sync += r.store_tag.eq(req_tag)
                        sync += r.store_valid.eq(1)
                        sync += r.end_row_ix.eq(
                            get_row_of_line(get_row(req_laddr)) - 1
                        )

                        # -- Prep for first wishbone read. We calculate the
                        # -- address of the start of the cache line and
                        # -- start the WB cycle.
                        # r.wb.adr <= req_laddr(r.wb.adr'left downto 0);
                        # r.wb.cyc <= '1';
                        # r.wb.stb <= '1';
                        # Prep for first wishbone read.
                        # We calculate the address of the start of the
                        # cache line and start the WB cycle.
                        sync += r.wb.adr.eq(req_laddr[:r.wb.adr.width])
                        sync += r.wb.cyc.eq(1)
                        sync += r.wb.stb.eq(1)

                        # -- Track that we had one request sent
                        # r.state <= CLR_TAG;
                        # Track that we had one request sent
                        sync += r.state.eq(State.CLR_TAG)
                    # end if;

                # when CLR_TAG | WAIT_ACK =>
                with m.Case(State.CLR_TAG, State.WAIT_ACK):
                    # if r.state = CLR_TAG then
                    with m.If(r.state == State.CLR_TAG):
                        # -- Get victim way from plru
                        # r.store_way <= replace_way;
                        # Get victim way from plru
                        sync += r.store_way.eq(replace_way)

                        # -- Force misses on that way while
                        # -- reloading that line
                        # cache_valids(req_index)(replace_way) <= '0';
                        # Force misses on that way while
                        # reloading that line
                        sync += cache_valid_bits[req_index].bit_select(
                            replace_way, 1).eq(0)

                        # -- Store new tag in selected way
                        # for i in 0 to NUM_WAYS-1 loop
                        #     if i = replace_way then
                        #         tagset := cache_tags(r.store_index);
                        #         write_tag(i, tagset, r.store_tag);
                        #         cache_tags(r.store_index) <= tagset;
                        #     end if;
                        # end loop;
                        for i in range(NUM_WAYS):
                            with m.If(i == replace_way):
                                comb += tagset.eq(
                                    cache_tags[r.store_index]
                                )
                                comb += write_tag(
                                    i, tagset, r.store_tag
                                )
                                sync += cache_tags[r.store_index].eq(
                                    tagset
                                )

                        # r.state <= WAIT_ACK;
                        sync += r.state.eq(State.WAIT_ACK)
                    # end if;

                    # -- Requests are all sent if stb is 0
                    # stbs_done := r.wb.stb = '0';
                    # Requests are all sent if stb is 0
                    comb += stbs_done.eq(r.wb.stb == 0)

                    # -- If we are still sending requests,
                    # -- was one accepted ?
                    # if wishbone_in.stall = '0' and not stbs_done then
                    # If we are still sending requests,
                    # was one accepted?
                    with m.If(~wb_in.stall & ~stbs_done):
                        # -- That was the last word ? We are done sending.
                        # -- Clear stb and set stbs_done so we can handle
                        # -- an eventual last ack on the same cycle.
                        # if is_last_row_addr(r.wb.adr, r.end_row_ix) then
                        #     r.wb.stb <= '0';
                        #     stbs_done := true;
                        # end if;
                        # That was the last word? We are done sending.
                        # Clear stb and set stbs_done so we can handle
                        # an eventual last ack on the same cycle.
                        with m.If(is_last_row_addr(
                                  r.wb.adr, r.end_row_ix)):
                            sync += r.wb.stb.eq(0)
                            comb += stbs_done.eq(1)

                        # -- Calculate the next row address
                        # r.wb.adr <= next_row_addr(r.wb.adr);
                        # Calculate the next row address
                        sync += r.wb.adr.eq(next_row_addr(r.wb.adr))
                    # end if;

                    # -- Incoming acks processing
                    # if wishbone_in.ack = '1' then
                    # Incoming acks processing
                    with m.If(wb_in.ack):
                        # r.rows_valid(r.store_row mod ROW_PER_LINE)
                        #  <= '1';
                        # store_row mod ROW_PER_LINE is just the low
                        # ROW_LINE_BITS bits
                        sync += r.rows_valid[
                            r.store_row[:ROW_LINE_BITS]
                        ].eq(1)

                        # -- Check for completion
                        # if stbs_done and
                        #     is_last_row(r.store_row, r.end_row_ix) then
                        # Check for completion
                        with m.If(stbs_done & is_last_row(
                                  r.store_row, r.end_row_ix)):
                            # -- Complete wishbone cycle
                            # r.wb.cyc <= '0';
                            # Complete wishbone cycle
                            sync += r.wb.cyc.eq(0)

                            # -- Cache line is now valid
                            # cache_valids(r.store_index)(replace_way) <=
                            #     r.store_valid and not inval_in;
                            # Cache line is now valid
                            sync += cache_valid_bits[
                                r.store_index
                            ].bit_select(replace_way, 1).eq(
                                r.store_valid & ~inval_in
                            )

                            # -- We are done
                            # r.state <= IDLE;
                            # We are done
                            sync += r.state.eq(State.IDLE)
                        # end if;

                        # -- Increment store row counter
                        # r.store_row <= next_row(r.store_row);
                        # Increment store row counter
                        sync += r.store_row.eq(next_row(r.store_row))
                    # end if;
            # end case;
        # end if;
        #
        # -- TLB miss and protection fault processing
        # if rst = '1' or flush_in = '1' or m_in.tlbld = '1' then
        #     r.fetch_failed <= '0';
        # elsif i_in.req = '1' and access_ok = '0' and
        #     stall_in = '0' then
        #     r.fetch_failed <= '1';
        # end if;
        # TLB miss and protection fault processing
        with m.If(ResetSignal() | flush_in | m_in.tlbld):
            sync += r.fetch_failed.eq(0)

        with m.Elif(i_in.req & ~access_ok & ~stall_in):
            sync += r.fetch_failed.eq(1)
        # end if;
        # end process;

    # icache_log: if LOG_LENGTH > 0 generate
    def icache_log(self, m, log_out):
        comb = m.d.comb
        sync = m.d.sync

        # elaboration-time guard, like the VHDL "generate"
        if LOG_LENGTH == 0:
            return

        # -- Output data to logger
        # signal log_data : std_ulogic_vector(53 downto 0);
        # begin
        # data_log: process(clk)
        #     variable lway: way_t;
        #     variable wstate: std_ulogic;
        # Output data to logger
        log_data = Signal(54)
        lway = Signal(3)        # logged as 3 bits, as in the VHDL
        wstate = Signal()

        # begin
        # if rising_edge(clk) then
        #     lway := req_hit_way;
        #     wstate := '0';
        comb += lway.eq(req_hit_way)
        comb += wstate.eq(0)

        # if r.state /= IDLE then
        #     wstate := '1';
        # end if;
        with m.If(r.state != State.IDLE):
            comb += wstate.eq(1)

        # log_data <= i_out.valid &
        #  i_out.insn &
        #  wishbone_in.ack &
        #  r.wb.adr(5 downto 3) &
        #  r.wb.stb & r.wb.cyc &
        #  wishbone_in.stall &
        #  stall_out &
        #  r.fetch_failed &
        #  r.hit_nia(5 downto 2) &
        #  wstate &
        #  std_ulogic_vector(to_unsigned(lway, 3)) &
        #  req_is_hit & req_is_miss &
        #  access_ok &
        #  ra_valid;
        sync += log_data.eq(Cat(
            ra_valid, access_ok, req_is_miss, req_is_hit,
            lway, wstate, r.hit_nia[2:6],
            r.fetch_failed, stall_out, wb_in.stall, r.wb.cyc,
            r.wb.stb, r.wb.adr[3:6], wb_in.ack, i_out.insn,
            i_out.valid
        ))
        # end if;
        # end process;
        # log_out <= log_data;
        comb += log_out.eq(log_data)
        # end generate;

    def elaborate(self, platform):

        m = Module()
        comb = m.d.comb

        # Storage. Hopefully "cache_rows" is a BRAM, the rest is LUTs
        cache_tags = CacheTagArray()
        cache_valid_bits = CacheValidBitsArray()

        # signal itlb_valids : tlb_valids_t;
        # signal itlb_tags : tlb_tags_t;
        # signal itlb_ptes : tlb_ptes_t;
        # attribute ram_style of itlb_tags : signal is "distributed";
        # attribute ram_style of itlb_ptes : signal is "distributed";
        itlb_valid_bits = TLBValidBitsArray()
        itlb_tags = TLBTagArray()
        itlb_ptes = TLBPTEArray()
        # TODO to be passed to nmigen as ram attributes
        # attribute ram_style of itlb_tags : signal is "distributed";
        # attribute ram_style of itlb_ptes : signal is "distributed";

        # -- Privilege bit from PTE EAA field
        # signal eaa_priv : std_ulogic;
        # Privilege bit from PTE EAA field
        eaa_priv = Signal()

        # signal r : reg_internal_t;
        r = RegInternal()

        # -- Async signals on incoming request
        # signal req_index : index_t;
        # signal req_row : row_t;
        # signal req_hit_way : way_t;
        # signal req_tag : cache_tag_t;
        # signal req_is_hit : std_ulogic;
        # signal req_is_miss : std_ulogic;
        # signal req_laddr : std_ulogic_vector(63 downto 0);
        # Async signals on incoming request
        req_index = Signal(INDEX_BITS)
        req_row = Signal(ROW_BITS)
        req_hit_way = Signal(WAY_BITS)
        req_tag = Signal(TAG_BITS)
        req_is_hit = Signal()
        req_is_miss = Signal()
        req_laddr = Signal(64)

        # signal tlb_req_index : tlb_index_t;
        # signal real_addr : std_ulogic_vector(
        #  REAL_ADDR_BITS - 1 downto 0
        # );
        # signal ra_valid : std_ulogic;
        # signal priv_fault : std_ulogic;
        # signal access_ok : std_ulogic;
        # signal use_previous : std_ulogic;
        tlb_req_index = Signal(TLB_BITS)
        real_addr = Signal(REAL_ADDR_BITS)
        ra_valid = Signal()
        priv_fault = Signal()
        access_ok = Signal()
        use_previous = Signal()

        # signal cache_out : cache_ram_out_t;
        cache_out = CacheRamOut()

        # signal plru_victim : plru_out_t;
        # signal replace_way : way_t;
        plru_victim = PLRUOut()
        replace_way = Signal(WAY_BITS)

        # TODO: call the methods above (rams, maybe_plrus, itlb_lookup,
        # itlb_update, icache_comb, icache_hit, icache_miss) and plumb
        # the local signals created here through to them
        return m


# icache_tb.vhdl
#
# library ieee;
# use ieee.std_logic_1164.all;
#
# library work;
# use work.common.all;
# use work.wishbone_types.all;
#
# entity icache_tb is
# end icache_tb;
#
# architecture behave of icache_tb is
#     signal clk : std_ulogic;
#     signal rst : std_ulogic;
#
#     signal i_out : Fetch1ToIcacheType;
#     signal i_in  : IcacheToDecode1Type;
#
#     signal m_out : MmuToIcacheType;
#
#     signal wb_bram_in  : wishbone_master_out;
#     signal wb_bram_out : wishbone_slave_out;
#
#     constant clk_period : time := 10 ns;
# begin
#     icache0: entity work.icache
#         generic map(
#             LINE_SIZE => 64,
#             NUM_LINES => 4
#         )
#         port map(
#             clk => clk,
#             rst => rst,
#             i_in => i_out,
#             i_out => i_in,
#             m_in => m_out,
#             stall_in => '0',
#             flush_in => '0',
#             inval_in => '0',
#             wishbone_out => wb_bram_in,
#             wishbone_in => wb_bram_out
#         );
#
#     -- BRAM Memory slave
#     bram0: entity work.wishbone_bram_wrapper
#         generic map(
#             MEMORY_SIZE => 1024,
#             RAM_INIT_FILE => "icache_test.bin"
#         )
#         port map(
#             clk => clk,
#             rst => rst,
#             wishbone_in => wb_bram_in,
#             wishbone_out => wb_bram_out
#         );
#
#     clk_process: process
#     begin
#         clk <= '0';
#         wait for clk_period/2;
#         clk <= '1';
#         wait for clk_period/2;
#     end process;
#
#     rst_process: process
#     begin
#         rst <= '1';
#         wait for 2*clk_period;
#         rst <= '0';
#         wait;
#     end process;
#
#     stim: process
#     begin
#         i_out.req <= '0';
#         i_out.nia <= (others => '0');
#         i_out.stop_mark <= '0';
#
#         m_out.tlbld <= '0';
#         m_out.tlbie <= '0';
#         m_out.addr <= (others => '0');
#         m_out.pte <= (others => '0');
#
#         wait until rising_edge(clk);
#         wait until rising_edge(clk);
#         wait until rising_edge(clk);
#         wait until rising_edge(clk);
#
#         i_out.req <= '1';
#         i_out.nia <= x"0000000000000004";
#
#         wait for 30*clk_period;
#         wait until rising_edge(clk);
#
#         assert i_in.valid = '1' severity failure;
#         assert i_in.insn = x"00000001"
#             report "insn @" & to_hstring(i_out.nia) &
#             "=" & to_hstring(i_in.insn) &
#             " expected 00000001"
#             severity failure;
#
#         i_out.req <= '0';
#
#         wait until rising_edge(clk);
#
#         -- hit
#         i_out.req <= '1';
#         i_out.nia <= x"0000000000000008";
#         wait until rising_edge(clk);
#         wait until rising_edge(clk);
#         assert i_in.valid = '1' severity failure;
#         assert i_in.insn = x"00000002"
#             report "insn @" & to_hstring(i_out.nia) &
#             "=" & to_hstring(i_in.insn) &
#             " expected 00000002"
#             severity failure;
#         wait until rising_edge(clk);
#
#         -- another miss
#         i_out.req <= '1';
#         i_out.nia <= x"0000000000000040";
#
#         wait for 30*clk_period;
#         wait until rising_edge(clk);
#
#         assert i_in.valid = '1' severity failure;
#         assert i_in.insn = x"00000010"
#             report "insn @" & to_hstring(i_out.nia) &
#             "=" & to_hstring(i_in.insn) &
#             " expected 00000010"
#             severity failure;
#
#         -- test something that aliases
#         i_out.req <= '1';
#         i_out.nia <= x"0000000000000100";
#         wait until rising_edge(clk);
#         wait until rising_edge(clk);
#         assert i_in.valid = '0' severity failure;
#         wait until rising_edge(clk);
#
#         wait for 30*clk_period;
#         wait until rising_edge(clk);
#
#         assert i_in.valid = '1' severity failure;
#         assert i_in.insn = x"00000040"
#             report "insn @" & to_hstring(i_out.nia) &
#             "=" & to_hstring(i_in.insn) &
#             " expected 00000040"
#             severity failure;
#
#         i_out.req <= '0';
#
#         std.env.finish;
#     end process;
# end;
def icache_sim(dut):
    # the testbench drives the DUT's inputs and observes its outputs,
    # mirroring the VHDL testbench port map above (i_in => i_out, etc.)
    i_out = dut.i_in
    i_in = dut.i_out
    m_out = dut.m_in

    yield i_out.req.eq(0)
    yield i_out.nia.eq(0)
    yield i_out.stop_mark.eq(0)
    yield m_out.tlbld.eq(0)
    yield m_out.tlbie.eq(0)
    yield m_out.addr.eq(0)
    yield m_out.pte.eq(0)
    yield
    yield
    yield
    yield
    yield i_out.req.eq(1)
    yield i_out.nia.eq(Const(0x0000000000000004, 64))
    for i in range(30):
        yield
    yield
    valid = yield i_in.valid
    nia = yield i_out.nia
    insn = yield i_in.insn
    assert valid
    assert insn == 0x00000001, \
        "insn @%x=%x expected 00000001" % (nia, insn)
    yield i_out.req.eq(0)
    yield

    # hit
    yield i_out.req.eq(1)
    yield i_out.nia.eq(Const(0x0000000000000008, 64))
    yield
    yield
    valid = yield i_in.valid
    nia = yield i_out.nia
    insn = yield i_in.insn
    assert valid
    assert insn == 0x00000002, \
        "insn @%x=%x expected 00000002" % (nia, insn)
    yield

    # another miss
    yield i_out.req.eq(1)
    yield i_out.nia.eq(Const(0x0000000000000040, 64))
    for i in range(30):
        yield
    yield
    valid = yield i_in.valid
    nia = yield i_out.nia
    insn = yield i_in.insn
    assert valid
    assert insn == 0x00000010, \
        "insn @%x=%x expected 00000010" % (nia, insn)

    # test something that aliases
    yield i_out.req.eq(1)
    yield i_out.nia.eq(Const(0x0000000000000100, 64))
    yield
    yield
    valid = yield i_in.valid
    # mid-reload: the aliasing access must miss (valid = '0' in the
    # VHDL testbench)
    assert not valid
    for i in range(30):
        yield
    yield
    valid = yield i_in.valid
    nia = yield i_out.nia
    insn = yield i_in.insn
    assert valid
    assert insn == 0x00000040, \
        "insn @%x=%x expected 00000040" % (nia, insn)
    yield i_out.req.eq(0)


def test_icache():
    dut = ICache()

    m = Module()
    m.submodules.icache = dut

    # nmigen Simulation
    sim = Simulator(m)
    sim.add_clock(1e-6)

    sim.add_sync_process(wrap(icache_sim(dut)))
    with sim.write_vcd('test_icache.vcd'):
        sim.run()

if __name__ == '__main__':
    dut = ICache()
    vl = rtlil.convert(dut, ports=[])
    with open("test_icache.il", "w") as f:
        f.write(vl)

    test_icache()