add fixed PLRU
[nmutil.git] / src / nmutil / plru2.py
index aab9ac8d45852c82f0cec764e023421269dd4f0e..d766f6c16b44ea7e5b4c2952a09c9f6004b1431e 100644 (file)
@@ -1,12 +1,12 @@
-# based on ariane plru, from tlb.sv
+# based on microwatt plru.vhdl
+# https://github.com/antonblanchard/microwatt/blob/f67b1431655c291fc1c99857a5c1ef624d5b264c/plru.vhdl
 
 # new PLRU API, once all users have migrated to new API in plru2.py, then
 # plru2.py will be renamed to plru.py.
 
-from nmigen import Signal, Module, Cat, Const, Repl, Array
-from nmigen.hdl.ir import Elaboratable
+from nmigen.hdl.ir import Elaboratable, Display, Signal, Array, Const, Value
+from nmigen.hdl.dsl import Module
 from nmigen.cli import rtlil
-from nmigen.utils import log2_int
 from nmigen.lib.coding import Decoder
 
 
@@ -17,149 +17,161 @@ class PLRU(Elaboratable):
         lvl0        0
                    / \
                   /   \
-        lvl1     1     2
-                / \   / \
-        lvl2   3   4 5   6
-              / \ /\/\  /\
+                 /     \
+        lvl1    1       2
+               / \     / \
+        lvl2  3   4   5   6
+             / \ / \ / \ / \
              ... ... ... ...
     """
 
-    def __init__(self, BITS):
-        self.BITS = BITS
-        self.acc_i = Signal(BITS)
-        self.acc_en = Signal()
-        self.lru_o = Signal(BITS)
-
-        self._plru_tree = Signal(self.TLBSZ)
+    def __init__(self, log2_num_ways, debug=False):
+        # type: (int, bool) -> None
+        """
+        Arguments:
+        log2_num_ways: int
+            the log-base-2 of the number of cache ways -- BITS in plru.vhdl
+        debug: bool
+            true if this should print debugging messages at simulation time.
+        """
+        assert log2_num_ways > 0
+        self.log2_num_ways = log2_num_ways
+        self.debug = debug
+        self.acc_i = Signal(log2_num_ways)
+        self.acc_en_i = Signal()
+        self.lru_o = Signal(log2_num_ways)
+
+        def mk_tree(i):
+            return Signal(name=f"tree_{i}", reset=0)
+
+        # the original VHDL declares its array one entry too large (the last
+        # entry is never used), so subtract 1 to compensate
+        self._tree = Array(mk_tree(i) for i in range(self.num_ways - 1))
         """ exposed only for testing """
 
-    @property
-    def TLBSZ(self):
-        return 2 * (self.BITS - 1)
-
-    def elaborate(self, platform=None):
-        m = Module()
+        def mk_node(i, prefix):
+            return Signal(range(self.num_ways), name=f"{prefix}_node_{i}",
+                          reset=0)
 
-        # Tree (bit per entry)
-
-        # Just predefine which nodes will be set/cleared
-        # E.g. for a TLB with 8 entries, the for-loop is semantically
-        # equivalent to the following pseudo-code:
-        # unique case (1'b1)
-        # acc_en[7]: plru_tree[0, 2, 6] = {1, 1, 1};
-        # acc_en[6]: plru_tree[0, 2, 6] = {1, 1, 0};
-        # acc_en[5]: plru_tree[0, 2, 5] = {1, 0, 1};
-        # acc_en[4]: plru_tree[0, 2, 5] = {1, 0, 0};
-        # acc_en[3]: plru_tree[0, 1, 4] = {0, 1, 1};
-        # acc_en[2]: plru_tree[0, 1, 4] = {0, 1, 0};
-        # acc_en[1]: plru_tree[0, 1, 3] = {0, 0, 1};
-        # acc_en[0]: plru_tree[0, 1, 3] = {0, 0, 0};
-        # default: begin /* No hit */ end
-        # endcase
-
-        LOG_TLB = log2_int(self.BITS, False)
-        hit = Signal(self.BITS, reset_less=True)
-        m.d.comb += hit.eq(Repl(self.acc_en, self.BITS) & self.acc_i)
-
-        for i in range(self.BITS):
-            # we got a hit so update the pointer as it was least recently used
-            with m.If(hit[i]):
-                # Set the nodes to the values we would expect
-                for lvl in range(LOG_TLB):
-                    idx_base = (1 << lvl)-1
-                    # lvl0 <=> MSB, lvl1 <=> MSB-1, ...
-                    shift = LOG_TLB - lvl
-                    new_idx = Const(~((i >> (shift-1)) & 1), 1)
-                    plru_idx = idx_base + (i >> shift)
-                    # print("plru", i, lvl, hex(idx_base),
-                    #      plru_idx, shift, new_idx)
-                    m.d.sync += self._plru_tree[plru_idx].eq(new_idx)
-
-        # Decode tree to write enable signals
-        # Next for-loop basically creates the following logic for e.g.
-        # an 8 entry TLB (note: pseudo-code obviously):
-        # replace_en[7] = &plru_tree[ 6, 2, 0]; #plru_tree[0,2,6]=={1,1,1}
-        # replace_en[6] = &plru_tree[~6, 2, 0]; #plru_tree[0,2,6]=={1,1,0}
-        # replace_en[5] = &plru_tree[ 5,~2, 0]; #plru_tree[0,2,5]=={1,0,1}
-        # replace_en[4] = &plru_tree[~5,~2, 0]; #plru_tree[0,2,5]=={1,0,0}
-        # replace_en[3] = &plru_tree[ 4, 1,~0]; #plru_tree[0,1,4]=={0,1,1}
-        # replace_en[2] = &plru_tree[~4, 1,~0]; #plru_tree[0,1,4]=={0,1,0}
-        # replace_en[1] = &plru_tree[ 3,~1,~0]; #plru_tree[0,1,3]=={0,0,1}
-        # replace_en[0] = &plru_tree[~3,~1,~0]; #plru_tree[0,1,3]=={0,0,0}
-        # For each entry traverse the tree. If every tree-node matches
-        # the corresponding bit of the entry's index, this is
-        # the next entry to replace.
-        replace = []
-        for i in range(self.BITS):
-            en = []
-            for lvl in range(LOG_TLB):
-                idx_base = (1 << lvl)-1
-                # lvl0 <=> MSB, lvl1 <=> MSB-1, ...
-                shift = LOG_TLB - lvl
-                new_idx = (i >> (shift-1)) & 1
-                plru_idx = idx_base + (i >> shift)
-                plru = Signal(reset_less=True,
-                              name="plru-%d-%d-%d-%d" %
-                              (i, lvl, plru_idx, new_idx))
-                m.d.comb += plru.eq(self._plru_tree[plru_idx])
-                if new_idx:
-                    en.append(~plru)  # yes inverted (using bool() below)
-                else:
-                    en.append(plru)  # yes inverted (using bool() below)
-            #print("plru", i, en)
-            # boolean logic manipulation:
-            # plru0 & plru1 & plru2 == ~(~plru0 | ~plru1 | ~plru2)
-            replace.append(~Cat(*en).bool())
-        m.d.comb += self.lru_o.eq(Cat(*replace))
+        nodes_range = range(self.log2_num_ways)
 
-        return m
-
-    def ports(self):
-        return [self.acc_en, self.lru_o, self.acc_i]
+        self._get_lru_nodes = [mk_node(i, "get_lru") for i in nodes_range]
+        """ exposed only for testing """
 
+        self._upd_lru_nodes = [mk_node(i, "upd_lru") for i in nodes_range]
+        """ exposed only for testing """
 
-class PLRUs(Elaboratable):
-    def __init__(self, n_plrus, n_bits):
-        self.n_plrus = n_plrus
-        self.n_bits = n_bits
-        self.valid = Signal()
-        self.way = Signal(n_bits)
-        self.index = Signal(n_plrus.bit_length())
-        self.isel = Signal(n_plrus.bit_length())
-        self.o_index = Signal(n_bits)
+    @property
+    def num_ways(self):
+        return 1 << self.log2_num_ways
+
+    def _display(self, msg, *args):
+        if not self.debug:
+            return []
+        # work around not yet having
+        # https://gitlab.com/nmigen/nmigen/-/merge_requests/10
+        # by sending through Value.cast()
+        return [Display(msg, *map(Value.cast, args))]
+
+    def _get_lru(self, m):
+        """ get_lru process in plru.vhdl """
+        # XXX Check if we can turn this into a little ROM instead that
+        # takes the tree bit vector and returns the LRU. See if it's better
+        # in terms of FPGA resource usage...
+        m.d.comb += self._get_lru_nodes[0].eq(0)
+        for i in range(self.log2_num_ways):
+            node = self._get_lru_nodes[i]
+            val = self._tree[node]
+            m.d.comb += self._display("GET: i:%i node:%#x val:%i",
+                                      i, node, val)
+            m.d.comb += self.lru_o[self.log2_num_ways - 1 - i].eq(val)
+            if i != self.log2_num_ways - 1:
+                # modified from the microwatt version, which indexes the tree
+                # with the `node * 2` value here rather than with `node`, the
+                # index used earlier in this loop iteration
+                node <<= 1
+                with m.If(val):
+                    m.d.comb += self._get_lru_nodes[i + 1].eq(node + 2)
+                with m.Else():
+                    m.d.comb += self._get_lru_nodes[i + 1].eq(node + 1)
+
+    def _update_lru(self, m):
+        """ update_lru process in plru.vhdl """
+        with m.If(self.acc_en_i):
+            m.d.comb += self._upd_lru_nodes[0].eq(0)
+            for i in range(self.log2_num_ways):
+                node = self._upd_lru_nodes[i]
+                abit = self.acc_i[self.log2_num_ways - 1 - i]
+                m.d.sync += [
+                    self._tree[node].eq(~abit),
+                    self._display("UPD: i:%i node:%#x val:%i",
+                                  i, node, ~abit),
+                ]
+                if i != self.log2_num_ways - 1:
+                    node <<= 1
+                    with m.If(abit):
+                        m.d.comb += self._upd_lru_nodes[i + 1].eq(node + 2)
+                    with m.Else():
+                        m.d.comb += self._upd_lru_nodes[i + 1].eq(node + 1)
 
-    def elaborate(self, platform):
-        """Generate TLB PLRUs
-        """
+    def elaborate(self, platform=None):
         m = Module()
-        comb = m.d.comb
-
-        if self.n_plrus == 0:
-            return m
-
-        # Binary-to-Unary one-hot, enabled by valid
-        m.submodules.te = te = Decoder(self.n_plrus)
-        comb += te.n.eq(~self.valid)
-        comb += te.i.eq(self.index)
-
-        out = Array(Signal(self.n_bits, name="plru_out%d" % x)
-                    for x in range(self.n_plrus))
-
-        for i in range(self.n_plrus):
-            # PLRU interface
-            m.submodules["plru_%d" % i] = plru = PLRU(self.n_bits)
-
-            comb += plru.acc_en.eq(te.o[i])
-            comb += plru.acc_i.eq(self.way)
-            comb += out[i].eq(plru.lru_o)
-
-        # select output based on index
-        comb += self.o_index.eq(out[self.isel])
-
+        self._get_lru(m)
+        self._update_lru(m)
         return m
 
+    def __iter__(self):
+        yield self.acc_i
+        yield self.acc_en_i
+        yield self.lru_o
+
     def ports(self):
-        return [self.valid, self.way, self.index, self.isel, self.o_index]
+        return list(self)
+
+
+# FIXME: convert PLRUs to new API
+# class PLRUs(Elaboratable):
+#     def __init__(self, n_plrus, n_bits):
+#         self.n_plrus = n_plrus
+#         self.n_bits = n_bits
+#         self.valid = Signal()
+#         self.way = Signal(n_bits)
+#         self.index = Signal(n_plrus.bit_length())
+#         self.isel = Signal(n_plrus.bit_length())
+#         self.o_index = Signal(n_bits)
+#
+#     def elaborate(self, platform):
+#         """Generate TLB PLRUs
+#         """
+#         m = Module()
+#         comb = m.d.comb
+#
+#         if self.n_plrus == 0:
+#             return m
+#
+#         # Binary-to-Unary one-hot, enabled by valid
+#         m.submodules.te = te = Decoder(self.n_plrus)
+#         comb += te.n.eq(~self.valid)
+#         comb += te.i.eq(self.index)
+#
+#         out = Array(Signal(self.n_bits, name="plru_out%d" % x)
+#                     for x in range(self.n_plrus))
+#
+#         for i in range(self.n_plrus):
+#             # PLRU interface
+#             m.submodules["plru_%d" % i] = plru = PLRU(self.n_bits)
+#
+#             comb += plru.acc_en.eq(te.o[i])
+#             comb += plru.acc_i.eq(self.way)
+#             comb += out[i].eq(plru.lru_o)
+#
+#         # select output based on index
+#         comb += self.o_index.eq(out[self.isel])
+#
+#         return m
+#
+#     def ports(self):
+#         return [self.valid, self.way, self.index, self.isel, self.o_index]
 
 
 if __name__ == '__main__':
@@ -168,7 +180,7 @@ if __name__ == '__main__':
     with open("test_plru.il", "w") as f:
         f.write(vl)
 
-    dut = PLRUs(4, 2)
-    vl = rtlil.convert(dut, ports=dut.ports())
-    with open("test_plrus.il", "w") as f:
-        f.write(vl)
+    # dut = PLRUs(4, 2)  # disabled: PLRUs is commented out (see FIXME)
+    # vl = rtlil.convert(dut, ports=dut.ports())
+    # with open("test_plrus.il", "w") as f:
+    #     f.write(vl)